diff --git a/.gitattributes b/.gitattributes index b30baaf8f5de13f165f9d193062da3a0830e81eb..23642aaed8dab3b5fdf6b879652e061f528fae0b 100644 --- a/.gitattributes +++ b/.gitattributes @@ -54,3 +54,15 @@ Model/last-checkpoint/tokenizer.json filter=lfs diff=lfs merge=lfs -text Model/tokenizer.json filter=lfs diff=lfs merge=lfs -text tokenizer.json filter=lfs diff=lfs merge=lfs -text last-checkpoint/tokenizer.json filter=lfs diff=lfs merge=lfs -text +LLaMA-Factory/wandb/run-20250620_021722-rdrftts8/run-rdrftts8.wandb filter=lfs diff=lfs merge=lfs -text +Model/LLaMA-Factory/wandb/run-20250618_020445-o5waoqcx/run-o5waoqcx.wandb filter=lfs diff=lfs merge=lfs -text +Model/Model/LLaMA-Factory/assets/wechat.jpg filter=lfs diff=lfs merge=lfs -text +Model/Model/LLaMA-Factory/assets/wechat_alaya.png filter=lfs diff=lfs merge=lfs -text +Model/Model/LLaMA-Factory/assets/wechat_npu.jpg filter=lfs diff=lfs merge=lfs -text +Model/Model/LLaMA-Factory/data/mllm_demo_data/1.mp3 filter=lfs diff=lfs merge=lfs -text +Model/Model/LLaMA-Factory/data/mllm_demo_data/1.mp4 filter=lfs diff=lfs merge=lfs -text +Model/Model/LLaMA-Factory/data/mllm_demo_data/2.avi filter=lfs diff=lfs merge=lfs -text +Model/Model/LLaMA-Factory/data/mllm_demo_data/3.flac filter=lfs diff=lfs merge=lfs -text +Model/Model/LLaMA-Factory/data/mllm_demo_data/3.mp4 filter=lfs diff=lfs merge=lfs -text +Model/Model/last-checkpoint/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Model/Model/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/LLaMA-Factory/README.md b/LLaMA-Factory/README.md index 490f8c0a6b8d48401b8d2df014035b8ad4daf424..d86a5323605ac21138529c5590e7cfd414949ff8 100644 --- a/LLaMA-Factory/README.md +++ b/LLaMA-Factory/README.md @@ -262,6 +262,7 @@ Choose your path: | [DeepSeek 2.5/3](https://huggingface.co/deepseek-ai) | 236B/671B | deepseek3 | | [DeepSeek R1 (Distill)](https://huggingface.co/deepseek-ai) | 1.5B/7B/8B/14B/32B/70B/671B | deepseekr1 | | [Falcon](https://huggingface.co/tiiuae) | 
7B/11B/40B/180B | falcon | +| [Falcon-H1](https://huggingface.co/tiiuae) | 0.5B/1.5B/3B/7B/34B | falcon_h1 | | [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma | | [Gemma 3](https://huggingface.co/google) | 1B/4B/12B/27B | gemma3/gemma (1B) | | [GLM-4/GLM-4-0414/GLM-Z1](https://huggingface.co/THUDM) | 9B/32B | glm4/glmz1 | diff --git a/LLaMA-Factory/assets/wechat.jpg b/LLaMA-Factory/assets/wechat.jpg index 5a0f28f5840456da055e901a8712c21dd9b60845..3ec263ae86a310e381fde3aab66ff7507a91481f 100644 --- a/LLaMA-Factory/assets/wechat.jpg +++ b/LLaMA-Factory/assets/wechat.jpg @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:90db00d9ffdfa2b364b61581c30c409100b8a3e8e25066b3a3217f5710d024eb -size 171500 +oid sha256:f2c75465c1e394897b7897eb7b368165da3086f52d3a642f45402d8a8cc3297e +size 168092 diff --git a/LLaMA-Factory/assets/wechat_npu.jpg b/LLaMA-Factory/assets/wechat_npu.jpg index b34e89f293cdb632ab4c3e54d01a8dfa9d7f335a..9c263c0709ee7c30ddc285db31fe3ab20903dcf1 100644 --- a/LLaMA-Factory/assets/wechat_npu.jpg +++ b/LLaMA-Factory/assets/wechat_npu.jpg @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8241933348dc7fd5863541aa7471e67a1164bb20d021c0a45af4177d40ab71b7 -size 172107 +oid sha256:857b271df0de601c4135ebd206d0bd2b44923d6ec27c57402e4ef81fca04ab4a +size 172660 diff --git a/LLaMA-Factory/src/llamafactory.egg-info/PKG-INFO b/LLaMA-Factory/src/llamafactory.egg-info/PKG-INFO index 196cc79bba9c8dbd817bb8b2a8e074591b7d88c5..2624aa3eca98fcd6088513868ff4d5282f7dafc7 100644 --- a/LLaMA-Factory/src/llamafactory.egg-info/PKG-INFO +++ b/LLaMA-Factory/src/llamafactory.egg-info/PKG-INFO @@ -386,6 +386,7 @@ Choose your path: | [DeepSeek 2.5/3](https://huggingface.co/deepseek-ai) | 236B/671B | deepseek3 | | [DeepSeek R1 (Distill)](https://huggingface.co/deepseek-ai) | 1.5B/7B/8B/14B/32B/70B/671B | deepseekr1 | | [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon | +| 
[Falcon-H1](https://huggingface.co/tiiuae) | 0.5B/1.5B/3B/7B/34B | falcon_h1 | | [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma | | [Gemma 3](https://huggingface.co/google) | 1B/4B/12B/27B | gemma3/gemma (1B) | | [GLM-4/GLM-4-0414/GLM-Z1](https://huggingface.co/THUDM) | 9B/32B | glm4/glmz1 | diff --git a/LLaMA-Factory/src/llamafactory/__pycache__/__init__.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/__pycache__/__init__.cpython-311.pyc index 0079644ad729d251814c9d472d47af0139931c52..f7c6eff0e2c46f9a5c37ac11c9d18e4435860c7a 100644 Binary files a/LLaMA-Factory/src/llamafactory/__pycache__/__init__.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/__pycache__/__init__.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/data/__pycache__/__init__.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/data/__pycache__/__init__.cpython-311.pyc index e18c069708f67f5d0c8c71a180aec7c02790835b..37f9f9b2fd484f4d0716d7b2e61437922d4a666b 100644 Binary files a/LLaMA-Factory/src/llamafactory/data/__pycache__/__init__.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/data/__pycache__/__init__.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/data/__pycache__/collator.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/data/__pycache__/collator.cpython-311.pyc index f9a173951329196acbe1e40b87ab4654996ac2b2..9209e23a7990bf62334ff0e8e86b6c4aef583549 100644 Binary files a/LLaMA-Factory/src/llamafactory/data/__pycache__/collator.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/data/__pycache__/collator.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/data/__pycache__/converter.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/data/__pycache__/converter.cpython-311.pyc index 10480e4ff0aab962c3ffe4d2b559ce9019b12a89..30ec4e3d711bfa972291eb2558f762cc59aa0b2c 100644 Binary files a/LLaMA-Factory/src/llamafactory/data/__pycache__/converter.cpython-311.pyc and 
b/LLaMA-Factory/src/llamafactory/data/__pycache__/converter.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/data/__pycache__/data_utils.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/data/__pycache__/data_utils.cpython-311.pyc index 2a5c95819449c82f670a2851c6ea77f168088e3c..330ff0ccf6573393bbe5cb9371c6fc743e201d27 100644 Binary files a/LLaMA-Factory/src/llamafactory/data/__pycache__/data_utils.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/data/__pycache__/data_utils.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/data/__pycache__/formatter.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/data/__pycache__/formatter.cpython-311.pyc index 25f16b76bfb70299ef5b9af721d88d2d1469fb88..b3cc74ca6b9b9969508f9407ae5650e688469faa 100644 Binary files a/LLaMA-Factory/src/llamafactory/data/__pycache__/formatter.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/data/__pycache__/formatter.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/data/__pycache__/loader.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/data/__pycache__/loader.cpython-311.pyc index 8869c8af0da3f7bc30a5cafefbef76381e57c988..e27a535141e7b919c900d064d1a8e15db202640e 100644 Binary files a/LLaMA-Factory/src/llamafactory/data/__pycache__/loader.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/data/__pycache__/loader.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/data/__pycache__/mm_plugin.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/data/__pycache__/mm_plugin.cpython-311.pyc index aed3e260c20e8d52a2436f7ea5252a947d9a4d33..277a8af0fe0362ebc09c9b25c429ca9c7c6f1309 100644 Binary files a/LLaMA-Factory/src/llamafactory/data/__pycache__/mm_plugin.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/data/__pycache__/mm_plugin.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/data/__pycache__/parser.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/data/__pycache__/parser.cpython-311.pyc index 
aa8a24b780d38b6768e316f8d0db3a9053776143..270d8278017a43db3cba7b71bf622f850e891c29 100644 Binary files a/LLaMA-Factory/src/llamafactory/data/__pycache__/parser.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/data/__pycache__/parser.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/data/__pycache__/template.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/data/__pycache__/template.cpython-311.pyc index 83188dc52c47af0ab8412f596c7114981aff3df1..49bcbc6b29450c86224aece8bffe4c86c4daf435 100644 Binary files a/LLaMA-Factory/src/llamafactory/data/__pycache__/template.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/data/__pycache__/template.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/data/__pycache__/tool_utils.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/data/__pycache__/tool_utils.cpython-311.pyc index 9b74d2621c539a1b6f5636ad26509136a0101594..e208c8b1d68c43736af4743f1c744c1c6a65cd6c 100644 Binary files a/LLaMA-Factory/src/llamafactory/data/__pycache__/tool_utils.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/data/__pycache__/tool_utils.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/__init__.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/__init__.cpython-311.pyc index 78d6c81bb18e72cf5451fea51854f35d24206940..c116f400846aa892b6de55dcc661f544671e2c81 100644 Binary files a/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/__init__.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/__init__.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/feedback.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/feedback.cpython-311.pyc index d4e93eefdc4f0b2db959aaad58e3e11c63851a2d..5591584155e35aa67a54700947d8680560aa98e1 100644 Binary files a/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/feedback.cpython-311.pyc and 
b/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/feedback.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/pairwise.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/pairwise.cpython-311.pyc index 4d82be171aa75122ee4cbfba3bfc150f50deddc5..5ee084312d8ec3140ab6de2f500c463751a86fed 100644 Binary files a/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/pairwise.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/pairwise.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/pretrain.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/pretrain.cpython-311.pyc index 7c7c7577b68d6b432f19fe810ed343580734c1ac..e700bba34e8f073887596e98c7a62a24de130e09 100644 Binary files a/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/pretrain.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/pretrain.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/processor_utils.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/processor_utils.cpython-311.pyc index 508a1435e34b1ab8734c723956bbcdae1abd16f0..2d4fd068040f19f18864b36fc259a0eb1c02b7a6 100644 Binary files a/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/processor_utils.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/processor_utils.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/supervised.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/supervised.cpython-311.pyc index 480210f27ef861b74c892eac399e50bfa1fb0589..2e54c7e77a84c36915f68bbdbf08e3ff6479fc3a 100644 Binary files a/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/supervised.cpython-311.pyc and 
b/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/supervised.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/unsupervised.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/unsupervised.cpython-311.pyc index 7eb9fd84b0e883bdf195da6ede79a6d245576310..156d53dd47c1618235955887f799ef7183a3b1be 100644 Binary files a/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/unsupervised.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/unsupervised.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/data/template.py b/LLaMA-Factory/src/llamafactory/data/template.py index ac19b30b7f7868982907b691da23f4e66a7b29e8..4a7696626394ac7831b3e49190dc170de3c5c90b 100644 --- a/LLaMA-Factory/src/llamafactory/data/template.py +++ b/LLaMA-Factory/src/llamafactory/data/template.py @@ -916,6 +916,19 @@ register_template( ) +register_template( + name="falcon_h1", + format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n"]), + format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]), + format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]), + format_function=FunctionFormatter(slots=["{{content}}<|im_end|>\n"], tool_format="default"), + format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n"]), + format_tools=ToolFormatter(tool_format="default"), + format_prefix=EmptyFormatter(slots=[{"bos_token"}]), + stop_words=["<|im_end|>", "<|end_of_text|>"], +) + + register_template( name="fewshot", format_assistant=StringFormatter(slots=["{{content}}\n\n"]), diff --git a/LLaMA-Factory/src/llamafactory/extras/__pycache__/__init__.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/extras/__pycache__/__init__.cpython-311.pyc index 55e44ce3551d4fea85bb2002d04010f0883aa2af..948d06af2636a8f88dce9e66347715afed021419 100644 Binary files 
a/LLaMA-Factory/src/llamafactory/extras/__pycache__/__init__.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/extras/__pycache__/__init__.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/extras/__pycache__/constants.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/extras/__pycache__/constants.cpython-311.pyc index 1e978829aa689661b7ef97a60c22c3ff0636b9be..a46bf7439848753e5a7c126f99aae6dd4a7a2ec0 100644 Binary files a/LLaMA-Factory/src/llamafactory/extras/__pycache__/constants.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/extras/__pycache__/constants.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/extras/__pycache__/env.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/extras/__pycache__/env.cpython-311.pyc index 8f43590514a7c889847977e246ce8877f65dd3b0..85d07ffffef75041fdbe1cd4840e21e2ebd833c0 100644 Binary files a/LLaMA-Factory/src/llamafactory/extras/__pycache__/env.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/extras/__pycache__/env.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/extras/__pycache__/logging.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/extras/__pycache__/logging.cpython-311.pyc index 1af8a3327ab76ce5e2e6c3ffcda236660e0a3834..b280f13bb794542a9486e0dd85b1e812cabaa42e 100644 Binary files a/LLaMA-Factory/src/llamafactory/extras/__pycache__/logging.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/extras/__pycache__/logging.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/extras/__pycache__/misc.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/extras/__pycache__/misc.cpython-311.pyc index e5d1def642185baa71e319c275894a9a026d3042..5405da095bee9cc45dcef19b4827a6c00b9af5b8 100644 Binary files a/LLaMA-Factory/src/llamafactory/extras/__pycache__/misc.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/extras/__pycache__/misc.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/extras/__pycache__/packages.cpython-311.pyc 
b/LLaMA-Factory/src/llamafactory/extras/__pycache__/packages.cpython-311.pyc index 5097d0b28ce36f21dea02d2a0defa8462523bc3b..f11c76672b59d41d1ea53198287ce7a29f29eec0 100644 Binary files a/LLaMA-Factory/src/llamafactory/extras/__pycache__/packages.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/extras/__pycache__/packages.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/extras/__pycache__/ploting.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/extras/__pycache__/ploting.cpython-311.pyc index 78d1693d283dd4cefa9f84558cd9d36e68610920..cbd207519befd1c4e1863cf50c2a33533049c924 100644 Binary files a/LLaMA-Factory/src/llamafactory/extras/__pycache__/ploting.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/extras/__pycache__/ploting.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/extras/constants.py b/LLaMA-Factory/src/llamafactory/extras/constants.py index f582d1f0f462715e98990522db359b787ccc34e8..85c886c080e7dcfd38f00df6fb2af68eca306d46 100644 --- a/LLaMA-Factory/src/llamafactory/extras/constants.py +++ b/LLaMA-Factory/src/llamafactory/extras/constants.py @@ -633,6 +633,61 @@ register_model_group( template="falcon", ) +register_model_group( + models={ + "Falcon-H1-0.5B-Instruct": { + DownloadSource.DEFAULT: "tiiuae/Falcon-H1-0.5B-Instruct", + DownloadSource.MODELSCOPE: "tiiuae/Falcon-H1-0.5B-Instruct", + }, + "Falcon-H1-0.5B-Base": { + DownloadSource.DEFAULT: "tiiuae/Falcon-H1-0.5B-Base", + DownloadSource.MODELSCOPE: "tiiuae/Falcon-H1-0.5B-Base", + }, + "Falcon-H1-1.5B-Instruct": { + DownloadSource.DEFAULT: "tiiuae/Falcon-H1-1.5B-Instruct", + DownloadSource.MODELSCOPE: "tiiuae/Falcon-H1-1.5B-Instruct", + }, + "Falcon-H1-1.5B-Base": { + DownloadSource.DEFAULT: "tiiuae/Falcon-H1-1.5B-Base", + DownloadSource.MODELSCOPE: "tiiuae/Falcon-H1-1.5B-Base", + }, + "Falcon-H1-1.5B-Deep-Instruct": { + DownloadSource.DEFAULT: "tiiuae/Falcon-H1-1.5B-Deep-Instruct", + DownloadSource.MODELSCOPE: "tiiuae/Falcon-H1-1.5B-Deep-Instruct", + }, + 
"Falcon-H1-1.5B-Deep-Base": { + DownloadSource.DEFAULT: "tiiuae/Falcon-H1-1.5B-Deep-Base", + DownloadSource.MODELSCOPE: "tiiuae/Falcon-H1-1.5B-Deep-Base", + }, + "Falcon-H1-3B-Instruct": { + DownloadSource.DEFAULT: "tiiuae/Falcon-H1-3B-Instruct", + DownloadSource.MODELSCOPE: "tiiuae/Falcon-H1-3B-Instruct", + }, + "Falcon-H1-3B-Base": { + DownloadSource.DEFAULT: "tiiuae/Falcon-H1-3B-Base", + DownloadSource.MODELSCOPE: "tiiuae/Falcon-H1-3B-Base", + }, + "Falcon-H1-7B-Instruct": { + DownloadSource.DEFAULT: "tiiuae/Falcon-H1-7B-Instruct", + DownloadSource.MODELSCOPE: "tiiuae/Falcon-H1-7B-Instruct", + }, + "Falcon-H1-7B-Base": { + DownloadSource.DEFAULT: "tiiuae/Falcon-H1-7B-Base", + DownloadSource.MODELSCOPE: "tiiuae/Falcon-H1-7B-Base", + }, + "Falcon-H1-34B-Instruct": { + DownloadSource.DEFAULT: "tiiuae/Falcon-H1-34B-Instruct", + DownloadSource.MODELSCOPE: "tiiuae/Falcon-H1-34B-Instruct", + }, + "Falcon-H1-34B-Base": { + DownloadSource.DEFAULT: "tiiuae/Falcon-H1-34B-Base", + DownloadSource.MODELSCOPE: "tiiuae/Falcon-H1-34B-Base", + }, + + }, + template="falcon_h1", +) + register_model_group( models={ diff --git a/LLaMA-Factory/src/llamafactory/hparams/__pycache__/__init__.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/hparams/__pycache__/__init__.cpython-311.pyc index 7db1eae5f6476490bb5e8cfc5f7968450d320f83..f3d0ea0eb7e68dd2a330bdc6bf1ee92d55038873 100644 Binary files a/LLaMA-Factory/src/llamafactory/hparams/__pycache__/__init__.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/hparams/__pycache__/__init__.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/hparams/__pycache__/data_args.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/hparams/__pycache__/data_args.cpython-311.pyc index d14298da60cb86f83b946856ef2063eb895e48ef..af6cd71f5e5e908f0166c69cee9421d120d33dee 100644 Binary files a/LLaMA-Factory/src/llamafactory/hparams/__pycache__/data_args.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/hparams/__pycache__/data_args.cpython-311.pyc
differ diff --git a/LLaMA-Factory/src/llamafactory/hparams/__pycache__/evaluation_args.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/hparams/__pycache__/evaluation_args.cpython-311.pyc index 16401a053a0369ba9a8648ae9668e5d0f5e82f57..a4f05a6a408af6f0f67eec13856b1befe17a3b0d 100644 Binary files a/LLaMA-Factory/src/llamafactory/hparams/__pycache__/evaluation_args.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/hparams/__pycache__/evaluation_args.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/hparams/__pycache__/finetuning_args.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/hparams/__pycache__/finetuning_args.cpython-311.pyc index d25caf8a6e0ccc9cdd53100af5a3afd5693c0feb..e3b7182ce52582548653080f77a8eaa9788c182a 100644 Binary files a/LLaMA-Factory/src/llamafactory/hparams/__pycache__/finetuning_args.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/hparams/__pycache__/finetuning_args.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/hparams/__pycache__/generating_args.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/hparams/__pycache__/generating_args.cpython-311.pyc index 1bab5829aa31b8c1c65c28322a53bab1ba4afd83..4c056837bda6d857e7127b5bda4ba075e9dbd18c 100644 Binary files a/LLaMA-Factory/src/llamafactory/hparams/__pycache__/generating_args.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/hparams/__pycache__/generating_args.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/hparams/__pycache__/model_args.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/hparams/__pycache__/model_args.cpython-311.pyc index a46f788cd5dff57df3bb508d2c28327312bd8918..eccb733ee2027dfcb3ff1d0f137b3bda103c737f 100644 Binary files a/LLaMA-Factory/src/llamafactory/hparams/__pycache__/model_args.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/hparams/__pycache__/model_args.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/hparams/__pycache__/parser.cpython-311.pyc 
b/LLaMA-Factory/src/llamafactory/hparams/__pycache__/parser.cpython-311.pyc index 1c041eca5167eea8ea9054a33bce0021f1688c9c..2d685264ae9bbce5e81a5331846778db30e610ad 100644 Binary files a/LLaMA-Factory/src/llamafactory/hparams/__pycache__/parser.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/hparams/__pycache__/parser.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/hparams/__pycache__/training_args.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/hparams/__pycache__/training_args.cpython-311.pyc index 3d8f438db03ca43cab4b8ca74c3cc5bb3a4bbc45..7cef74e80b21dbaff482c58dbb4f2e4e3c37d4ea 100644 Binary files a/LLaMA-Factory/src/llamafactory/hparams/__pycache__/training_args.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/hparams/__pycache__/training_args.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/model/__pycache__/__init__.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/model/__pycache__/__init__.cpython-311.pyc index 366ef09e4fc5a918201a65c7e3dda7eedc89102d..2e8dea5550ee5e55a3a945b6c490fb9f888d1a97 100644 Binary files a/LLaMA-Factory/src/llamafactory/model/__pycache__/__init__.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/model/__pycache__/__init__.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/model/__pycache__/adapter.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/model/__pycache__/adapter.cpython-311.pyc index 85fecc6e387c2fabd67ed3e6c1792c320f2af83f..f36930c0d1cc94e894809577479c3dfaf90b1eb2 100644 Binary files a/LLaMA-Factory/src/llamafactory/model/__pycache__/adapter.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/model/__pycache__/adapter.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/model/__pycache__/loader.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/model/__pycache__/loader.cpython-311.pyc index 18dcdfaad729bbad9e4e474d18c471d42bf14bb8..dc9a59e1eb81f6545a3709fe1869446c532826e7 100644 Binary files 
a/LLaMA-Factory/src/llamafactory/model/__pycache__/loader.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/model/__pycache__/loader.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/model/__pycache__/patcher.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/model/__pycache__/patcher.cpython-311.pyc index 64e922fea36c20522b7bf9b63c772e73d4e0f2ec..4cb1c46b34cdce71c4bf1a652d0f0bb9a5c31f35 100644 Binary files a/LLaMA-Factory/src/llamafactory/model/__pycache__/patcher.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/model/__pycache__/patcher.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/__init__.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/__init__.cpython-311.pyc index cd18482d1203b12ac5fdf62c7156f3ad6ae34c9b..e0db7fa364bfa4e842c3357af394ddd47ea23699 100644 Binary files a/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/__init__.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/__init__.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/attention.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/attention.cpython-311.pyc index cb347857ac2ff6e95c9a8f2ac5fbce9f53c9ba30..2f83624f4b23279b17c34c7f56413d312a85c035 100644 Binary files a/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/attention.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/attention.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/checkpointing.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/checkpointing.cpython-311.pyc index 4eb58b86afca79384a318d7e5438beb1a30efc43..362e11ae56c7ad7864b0c1a122d8ae1e33a15167 100644 Binary files a/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/checkpointing.cpython-311.pyc and 
b/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/checkpointing.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/embedding.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/embedding.cpython-311.pyc index dc37b0129f56be95cab5825eedfdbb796e510f5b..18f3d073759bf48c07c6a3619fc0c70554dc424f 100644 Binary files a/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/embedding.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/embedding.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/kv_cache.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/kv_cache.cpython-311.pyc index a7c143c5fa729cbfde1190f24e6637f80e90a66e..28844de1bb0308233a88b67a11d4fad55d7181d5 100644 Binary files a/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/kv_cache.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/kv_cache.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/liger_kernel.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/liger_kernel.cpython-311.pyc index bf12fd68c0ec38c7ed9b74d25148be7fe7e33bf1..b95d0d5e929a7996755b3f742b97df051c2e1760 100644 Binary files a/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/liger_kernel.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/liger_kernel.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/longlora.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/longlora.cpython-311.pyc index d323710cf235f975f674b42b91bf8006dab30e30..26fc50487da8a281fd7f7c27f8c11f3da3c3ae73 100644 Binary files a/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/longlora.cpython-311.pyc and 
b/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/longlora.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/misc.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/misc.cpython-311.pyc index 6833531f7fc405cbac4064e3b6708ef0830e2584..1dbe4313c694d987e326336ac4839b17d77ad959 100644 Binary files a/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/misc.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/misc.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/mod.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/mod.cpython-311.pyc index 3660f04810e5b676c61a9f8800f9b44a6e661665..19c458f187fd22f622afcdbc39a63e2695f76184 100644 Binary files a/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/mod.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/mod.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/moe.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/moe.cpython-311.pyc index a2380d033789d60eb371e8f86554ca95a27705dd..76ba7297df68ff3f2ac7c0d6b71186f29dd25c32 100644 Binary files a/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/moe.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/moe.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/packing.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/packing.cpython-311.pyc index 4718e09e15404971b832c24ace6b9b0f020c4f24..b389a8c4d9137d70a90c244ffc81fc2cbcc61d01 100644 Binary files a/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/packing.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/packing.cpython-311.pyc differ diff --git 
a/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/quantization.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/quantization.cpython-311.pyc index ee7838943fea9b2232c73b8aa8b9216c7eb5e7e2..4d4fa3368f7fbc5eaba0dca84526c78573bb9005 100644 Binary files a/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/quantization.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/quantization.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/rope.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/rope.cpython-311.pyc index e8103754c13bf84b5afc16f850e77f594cff6619..06e88f1b26a52bb8dfd47e5c97132d44d57bbd6c 100644 Binary files a/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/rope.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/rope.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/unsloth.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/unsloth.cpython-311.pyc index 1bc649eee463a422a94cb116325ceedcbda2003a..357c6233b1a28225d234485496905a5353af8f66 100644 Binary files a/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/unsloth.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/unsloth.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/valuehead.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/valuehead.cpython-311.pyc index 80dc712f411a7df0022e83be1801e5cfc8191b85..741faec8b85a2437e45daa69b3ad99a980516022 100644 Binary files a/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/valuehead.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/valuehead.cpython-311.pyc differ diff --git 
a/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/visual.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/visual.cpython-311.pyc index 11956dd0ccf8bba74492e2f4e0b9b97ae3736969..ba5178f19c45ca3da41e56b077746d43f8cf36ee 100644 Binary files a/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/visual.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/visual.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/train/__pycache__/__init__.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/train/__pycache__/__init__.cpython-311.pyc index f1e248f0492ab564979080bd34d4e2cf19ea2474..d07d43dce75ace7da83521c8a967c80e1920f449 100644 Binary files a/LLaMA-Factory/src/llamafactory/train/__pycache__/__init__.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/train/__pycache__/__init__.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/train/__pycache__/callbacks.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/train/__pycache__/callbacks.cpython-311.pyc index 290d7c4ca732170c285d7af86857f2003df879cb..39d573d06c8f12babc857b903ebe2c717e364cc1 100644 Binary files a/LLaMA-Factory/src/llamafactory/train/__pycache__/callbacks.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/train/__pycache__/callbacks.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/train/__pycache__/trainer_utils.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/train/__pycache__/trainer_utils.cpython-311.pyc index 2d6bd2c86d494399699cfde94eaf1e1903bfd089..0e029a8b468fc1535ff351600ec8f7c055203027 100644 Binary files a/LLaMA-Factory/src/llamafactory/train/__pycache__/trainer_utils.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/train/__pycache__/trainer_utils.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/train/__pycache__/tuner.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/train/__pycache__/tuner.cpython-311.pyc index 
ed32ca4d68eea9c5b3428d0d1febabc220b534a0..d0ab4447451398626137456f9934dcd361a3d34c 100644 Binary files a/LLaMA-Factory/src/llamafactory/train/__pycache__/tuner.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/train/__pycache__/tuner.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/train/dpo/__pycache__/__init__.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/train/dpo/__pycache__/__init__.cpython-311.pyc index d753b9a4de7eea76e2795be777803e8797705a60..ad4697c5983095802278d5af009246884c599e36 100644 Binary files a/LLaMA-Factory/src/llamafactory/train/dpo/__pycache__/__init__.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/train/dpo/__pycache__/__init__.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/train/dpo/__pycache__/trainer.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/train/dpo/__pycache__/trainer.cpython-311.pyc index 587315b855b419fad9dc2aa8e40740fecc2a7e78..ec5713e74111869f7d35afb99372c76cc683a425 100644 Binary files a/LLaMA-Factory/src/llamafactory/train/dpo/__pycache__/trainer.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/train/dpo/__pycache__/trainer.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/train/dpo/__pycache__/workflow.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/train/dpo/__pycache__/workflow.cpython-311.pyc index 992a898f44bc6d945616eda65ebab0c09c8fa928..1a16bf0dbb497c5157f37a11ac45f97d8cf47015 100644 Binary files a/LLaMA-Factory/src/llamafactory/train/dpo/__pycache__/workflow.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/train/dpo/__pycache__/workflow.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/train/kto/__pycache__/__init__.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/train/kto/__pycache__/__init__.cpython-311.pyc index 7c09272257cb8b3a7f3b7ec1f41b7c6c1bcca43e..9251e0b47349a72554526c71b194f7c4c1928f3d 100644 Binary files a/LLaMA-Factory/src/llamafactory/train/kto/__pycache__/__init__.cpython-311.pyc and 
b/LLaMA-Factory/src/llamafactory/train/kto/__pycache__/__init__.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/train/kto/__pycache__/trainer.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/train/kto/__pycache__/trainer.cpython-311.pyc index 1e8fc514f0e50283075eb9800959ee06014e4f16..6cdf908b974911e7f1c12dece99ec52dd56ccd7c 100644 Binary files a/LLaMA-Factory/src/llamafactory/train/kto/__pycache__/trainer.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/train/kto/__pycache__/trainer.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/train/kto/__pycache__/workflow.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/train/kto/__pycache__/workflow.cpython-311.pyc index e59c7f7488f8ff5298960ff04c6ec77e924167a1..c49401f8489dac37b80d66de0965927d7cdc0bde 100644 Binary files a/LLaMA-Factory/src/llamafactory/train/kto/__pycache__/workflow.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/train/kto/__pycache__/workflow.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/train/ppo/__pycache__/__init__.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/train/ppo/__pycache__/__init__.cpython-311.pyc index 21dff9835ea7d30f11a93a3b2756ab7ecf779b2a..2d55908a2ec3b27475b833756d68353440239dd8 100644 Binary files a/LLaMA-Factory/src/llamafactory/train/ppo/__pycache__/__init__.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/train/ppo/__pycache__/__init__.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/train/ppo/__pycache__/ppo_utils.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/train/ppo/__pycache__/ppo_utils.cpython-311.pyc index 14be3537657d18e34aa8a84c883a42bdfc1fef7a..3ac2c11c567080327e1326db255931e8e947513a 100644 Binary files a/LLaMA-Factory/src/llamafactory/train/ppo/__pycache__/ppo_utils.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/train/ppo/__pycache__/ppo_utils.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/train/ppo/__pycache__/trainer.cpython-311.pyc 
b/LLaMA-Factory/src/llamafactory/train/ppo/__pycache__/trainer.cpython-311.pyc index 5cb64fb3157e9965c605c4f92035a636e3221ff4..ad44f773afc12acf4b15f9071793a249b84ffadb 100644 Binary files a/LLaMA-Factory/src/llamafactory/train/ppo/__pycache__/trainer.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/train/ppo/__pycache__/trainer.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/train/ppo/__pycache__/workflow.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/train/ppo/__pycache__/workflow.cpython-311.pyc index c68121dc560210110b4e2306dabb968942fd9f68..abcb31ba995ca15c4f00ee29784aef3a91fd9984 100644 Binary files a/LLaMA-Factory/src/llamafactory/train/ppo/__pycache__/workflow.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/train/ppo/__pycache__/workflow.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/train/pt/__pycache__/__init__.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/train/pt/__pycache__/__init__.cpython-311.pyc index f97fd31e55364fb8dd1eea343cadadea517f7450..efcd55e396bdd3c3807c3926230e913160785d3a 100644 Binary files a/LLaMA-Factory/src/llamafactory/train/pt/__pycache__/__init__.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/train/pt/__pycache__/__init__.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/train/pt/__pycache__/trainer.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/train/pt/__pycache__/trainer.cpython-311.pyc index 875b84279297d061efc4acb714063304ce85043b..ed3c049f35022406680682300d9f0c6483ff8403 100644 Binary files a/LLaMA-Factory/src/llamafactory/train/pt/__pycache__/trainer.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/train/pt/__pycache__/trainer.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/train/pt/__pycache__/workflow.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/train/pt/__pycache__/workflow.cpython-311.pyc index 1a64340e5146a869a76f9492ebe5550ef249e0b0..268898144777adcc3dc47fd7471e885122434f0c 100644 Binary files 
a/LLaMA-Factory/src/llamafactory/train/pt/__pycache__/workflow.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/train/pt/__pycache__/workflow.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/train/rm/__pycache__/__init__.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/train/rm/__pycache__/__init__.cpython-311.pyc index 0dba75ea8f5a2605a66f9b2bff2013099e5972e2..c5bec5b9725a2608916148648a8f2b688d6f5fe6 100644 Binary files a/LLaMA-Factory/src/llamafactory/train/rm/__pycache__/__init__.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/train/rm/__pycache__/__init__.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/train/rm/__pycache__/metric.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/train/rm/__pycache__/metric.cpython-311.pyc index 9999071083feb54a6cdf64c5a55a93b295d5f75e..b217bfe38795f685a068a90521e137b85aa42e51 100644 Binary files a/LLaMA-Factory/src/llamafactory/train/rm/__pycache__/metric.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/train/rm/__pycache__/metric.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/train/rm/__pycache__/trainer.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/train/rm/__pycache__/trainer.cpython-311.pyc index bbeb9ef0a4e9e641fee455b001bbd1d96bb44c8b..70fe7ba2de8d3c7611ee841cb28b4919081718be 100644 Binary files a/LLaMA-Factory/src/llamafactory/train/rm/__pycache__/trainer.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/train/rm/__pycache__/trainer.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/train/rm/__pycache__/workflow.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/train/rm/__pycache__/workflow.cpython-311.pyc index 5a80f27dff7a4e4476094b5f83aa216e06672da8..9a1becb45d7ce8c283f2b1b5984ee7aca7b266e7 100644 Binary files a/LLaMA-Factory/src/llamafactory/train/rm/__pycache__/workflow.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/train/rm/__pycache__/workflow.cpython-311.pyc differ diff --git 
a/LLaMA-Factory/src/llamafactory/train/sft/__pycache__/__init__.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/train/sft/__pycache__/__init__.cpython-311.pyc index 60f3e9c473cec66bff891a260cc70ecb1b0817cf..6c9657082d5832c022af696be1dd8c99a1050109 100644 Binary files a/LLaMA-Factory/src/llamafactory/train/sft/__pycache__/__init__.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/train/sft/__pycache__/__init__.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/train/sft/__pycache__/metric.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/train/sft/__pycache__/metric.cpython-311.pyc index a5610c8b2043bb0906c44d234653594b2ee9530b..9e1f022d32ddff26f740f17c43648f35cb9e4f8c 100644 Binary files a/LLaMA-Factory/src/llamafactory/train/sft/__pycache__/metric.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/train/sft/__pycache__/metric.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/train/sft/__pycache__/trainer.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/train/sft/__pycache__/trainer.cpython-311.pyc index 6731cf20c66bfa9ab48b54e6fa5b7a34d109ddc9..acbb290be21415b8f49372d0c48e5b260ab2382f 100644 Binary files a/LLaMA-Factory/src/llamafactory/train/sft/__pycache__/trainer.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/train/sft/__pycache__/trainer.cpython-311.pyc differ diff --git a/LLaMA-Factory/src/llamafactory/train/sft/__pycache__/workflow.cpython-311.pyc b/LLaMA-Factory/src/llamafactory/train/sft/__pycache__/workflow.cpython-311.pyc index e036b23ab59bcb7e2f394045354efc7f2b6e863b..dd7f4600d762eddbbd3671752bd8a754572e40ce 100644 Binary files a/LLaMA-Factory/src/llamafactory/train/sft/__pycache__/workflow.cpython-311.pyc and b/LLaMA-Factory/src/llamafactory/train/sft/__pycache__/workflow.cpython-311.pyc differ diff --git a/LLaMA-Factory/wandb/debug-internal.log b/LLaMA-Factory/wandb/debug-internal.log index d68dc2b105686882a2e9846d36d51b3aa2b97dfc..ffaaac2b37b898ad8428f03367923e6032efae5c 100644 --- 
a/LLaMA-Factory/wandb/debug-internal.log +++ b/LLaMA-Factory/wandb/debug-internal.log @@ -1,7 +1,7 @@ -{"time":"2025-06-18T02:04:45.169771821Z","level":"INFO","msg":"stream: starting","core version":"0.19.9","symlink path":"/kaggle/working/LLaMA-Factory/wandb/run-20250618_020445-o5waoqcx/logs/debug-core.log"} -{"time":"2025-06-18T02:04:45.427972883Z","level":"INFO","msg":"created new stream","id":"o5waoqcx"} -{"time":"2025-06-18T02:04:45.428081239Z","level":"INFO","msg":"stream: started","id":"o5waoqcx"} -{"time":"2025-06-18T02:04:45.428182138Z","level":"INFO","msg":"sender: started","stream_id":"o5waoqcx"} -{"time":"2025-06-18T02:04:45.428270431Z","level":"INFO","msg":"handler: started","stream_id":"o5waoqcx"} -{"time":"2025-06-18T02:04:45.428273846Z","level":"INFO","msg":"writer: Do: started","stream_id":"o5waoqcx"} -{"time":"2025-06-18T02:04:45.721935563Z","level":"INFO","msg":"Starting system monitor"} +{"time":"2025-06-20T02:17:22.480161661Z","level":"INFO","msg":"stream: starting","core version":"0.20.1","symlink path":"/kaggle/working/LLaMA-Factory/wandb/run-20250620_021722-rdrftts8/logs/debug-core.log"} +{"time":"2025-06-20T02:17:22.670730676Z","level":"INFO","msg":"stream: created new stream","id":"rdrftts8"} +{"time":"2025-06-20T02:17:22.670819673Z","level":"INFO","msg":"stream: started","id":"rdrftts8"} +{"time":"2025-06-20T02:17:22.670905475Z","level":"INFO","msg":"handler: started","stream_id":"rdrftts8"} +{"time":"2025-06-20T02:17:22.670915321Z","level":"INFO","msg":"writer: Do: started","stream_id":"rdrftts8"} +{"time":"2025-06-20T02:17:22.670939142Z","level":"INFO","msg":"sender: started","stream_id":"rdrftts8"} +{"time":"2025-06-20T02:17:22.907655773Z","level":"INFO","msg":"Starting system monitor"} diff --git a/LLaMA-Factory/wandb/debug.log b/LLaMA-Factory/wandb/debug.log index afdb722aae1e8a6304b2572da6e3ed067d2818bd..7055c326b654db1959c61db9fe2571bd84dab410 100644 --- a/LLaMA-Factory/wandb/debug.log +++ b/LLaMA-Factory/wandb/debug.log @@ -1,26 
+1,25 @@ -2025-06-18 02:04:45,149 INFO MainThread:294 [wandb_setup.py:_flush():67] Current SDK version is 0.19.9 -2025-06-18 02:04:45,150 INFO MainThread:294 [wandb_setup.py:_flush():67] Configure stats pid to 294 -2025-06-18 02:04:45,150 INFO MainThread:294 [wandb_setup.py:_flush():67] Loading settings from /root/.config/wandb/settings -2025-06-18 02:04:45,150 INFO MainThread:294 [wandb_setup.py:_flush():67] Loading settings from /kaggle/working/LLaMA-Factory/wandb/settings -2025-06-18 02:04:45,150 INFO MainThread:294 [wandb_setup.py:_flush():67] Loading settings from environment variables -2025-06-18 02:04:45,150 INFO MainThread:294 [wandb_init.py:setup_run_log_directory():662] Logging user logs to /kaggle/working/LLaMA-Factory/wandb/run-20250618_020445-o5waoqcx/logs/debug.log -2025-06-18 02:04:45,150 INFO MainThread:294 [wandb_init.py:setup_run_log_directory():663] Logging internal logs to /kaggle/working/LLaMA-Factory/wandb/run-20250618_020445-o5waoqcx/logs/debug-internal.log -2025-06-18 02:04:45,150 INFO MainThread:294 [wandb_init.py:init():781] calling init triggers -2025-06-18 02:04:45,150 INFO MainThread:294 [wandb_init.py:init():786] wandb.init called with sweep_config: {} +2025-06-20 02:17:22,247 INFO MainThread:383 [wandb_setup.py:_flush():81] Current SDK version is 0.20.1 +2025-06-20 02:17:22,247 INFO MainThread:383 [wandb_setup.py:_flush():81] Configure stats pid to 383 +2025-06-20 02:17:22,248 INFO MainThread:383 [wandb_setup.py:_flush():81] Loading settings from /root/.config/wandb/settings +2025-06-20 02:17:22,248 INFO MainThread:383 [wandb_setup.py:_flush():81] Loading settings from /kaggle/working/LLaMA-Factory/wandb/settings +2025-06-20 02:17:22,248 INFO MainThread:383 [wandb_setup.py:_flush():81] Loading settings from environment variables +2025-06-20 02:17:22,248 INFO MainThread:383 [wandb_init.py:setup_run_log_directory():703] Logging user logs to /kaggle/working/LLaMA-Factory/wandb/run-20250620_021722-rdrftts8/logs/debug.log +2025-06-20 
02:17:22,248 INFO MainThread:383 [wandb_init.py:setup_run_log_directory():704] Logging internal logs to /kaggle/working/LLaMA-Factory/wandb/run-20250620_021722-rdrftts8/logs/debug-internal.log +2025-06-20 02:17:22,248 INFO MainThread:383 [wandb_init.py:init():831] calling init triggers +2025-06-20 02:17:22,248 INFO MainThread:383 [wandb_init.py:init():836] wandb.init called with sweep_config: {} config: {'_wandb': {}} -2025-06-18 02:04:45,150 INFO MainThread:294 [wandb_init.py:init():809] starting backend -2025-06-18 02:04:45,150 INFO MainThread:294 [wandb_init.py:init():813] sending inform_init request -2025-06-18 02:04:45,163 INFO MainThread:294 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn -2025-06-18 02:04:45,163 INFO MainThread:294 [wandb_init.py:init():823] backend started and connected -2025-06-18 02:04:45,172 INFO MainThread:294 [wandb_init.py:init():915] updated telemetry -2025-06-18 02:04:45,188 INFO MainThread:294 [wandb_init.py:init():939] communicating run to backend with 90.0 second timeout -2025-06-18 02:04:45,709 INFO MainThread:294 [wandb_init.py:init():1014] starting run threads in backend -2025-06-18 02:04:46,481 INFO MainThread:294 [wandb_run.py:_console_start():2454] atexit reg -2025-06-18 02:04:46,482 INFO MainThread:294 [wandb_run.py:_redirect():2306] redirect: wrap_raw -2025-06-18 02:04:46,482 INFO MainThread:294 [wandb_run.py:_redirect():2371] Wrapping output streams. -2025-06-18 02:04:46,482 INFO MainThread:294 [wandb_run.py:_redirect():2394] Redirects installed. 
-2025-06-18 02:04:46,499 INFO MainThread:294 [wandb_init.py:init():1056] run started, returning control to user process -2025-06-18 02:04:46,503 INFO MainThread:294 [wandb_run.py:_config_callback():1327] config_cb None None {'peft_config': {'default': {'task_type': , 'peft_type': , 'auto_mapping': None, 'base_model_name_or_path': 'Qwen/Qwen2.5-1.5B', 'revision': None, 'inference_mode': False, 'r': 64, 'target_modules': {'q_proj', 'up_proj', 'down_proj', 'k_proj', 'gate_proj', 'v_proj', 'o_proj'}, 'exclude_modules': None, 'lora_alpha': 128, 'lora_dropout': 0.0, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'loftq_config': {}, 'eva_config': None, 'use_dora': False, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False}}, 'vocab_size': 151936, 'max_position_embeddings': 131072, 'hidden_size': 1536, 'intermediate_size': 8960, 'num_hidden_layers': 28, 'num_attention_heads': 12, 'use_sliding_window': False, 'sliding_window': 131072, 'max_window_layers': 28, 'num_key_value_heads': 2, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'rope_theta': 1000000.0, 'rope_scaling': None, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 
50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['Qwen2ForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 151643, 'pad_token_id': None, 'eos_token_id': 151643, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'Qwen/Qwen2.5-1.5B', '_attn_implementation_autoset': True, 'transformers_version': '4.51.3', 'model_type': 'qwen2', 'use_mrope': False, 'output_dir': '/kaggle/working/', 'overwrite_output_dir': True, 'do_train': True, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 16, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 0.0001, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3.0, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.1, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': '/kaggle/working/runs/Jun18_02-03-42_79b2ce5216f6', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 50, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 200, 'save_total_limit': None, 
'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 200, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': 'Qwennn', 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': ['labels'], 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'tp_size': 0, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': False, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': '/kaggle/working/Model/last-checkpoint', 'hub_model_id': None, 'hub_strategy': 'checkpoint', 'hub_token': '', 'hub_private_repo': True, 'hub_always_push': False, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': 
False, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 180000000, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False, 'average_tokens_across_devices': False, 'sortish_sampler': False, 'predict_with_generate': False, 'generation_max_length': 2048, 'generation_num_beams': None, 'generation_config': None, 'ray_run_name': None, 'ray_storage_path': './saves', 'ray_storage_filesystem': None, 'ray_num_workers': 1, 'resources_per_worker': {'GPU': 1}, 'placement_strategy': 'PACK', 'ray_init_kwargs': None} -2025-06-18 02:04:46,536 INFO MainThread:294 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 1617573376 - > -2025-06-18 02:04:46,536 INFO MainThread:294 [wandb_run.py:_config_callback():1327] config_cb model/num_parameters 1617573376 None -2025-06-18 02:04:46,542 INFO MainThread:294 [wandb_run.py:_config_callback():1327] config_cb None None {'model_args': {'model_name_or_path': 'Qwen/Qwen2.5-1.5B', 'adapter_name_or_path': None, 'adapter_folder': None, 'cache_dir': None, 'use_fast_tokenizer': True, 'resize_vocab': False, 'split_special_tokens': False, 'add_tokens': None, 'add_special_tokens': None, 'model_revision': 'main', 'low_cpu_mem_usage': True, 'rope_scaling': None, 'flash_attn': 'AUTO', 'shift_attn': False, 'mixture_of_depths': None, 'use_unsloth': False, 'use_unsloth_gc': False, 'enable_liger_kernel': False, 'moe_aux_loss_coef': None, 'disable_gradient_checkpointing': False, 'use_reentrant_gc': True, 
'upcast_layernorm': False, 'upcast_lmhead_output': False, 'train_from_scratch': False, 'infer_backend': 'HF', 'offload_folder': 'offload', 'use_cache': True, 'infer_dtype': 'auto', 'hf_hub_token': '', 'ms_hub_token': '', 'om_hub_token': '', 'print_param_status': False, 'trust_remote_code': True, 'quantization_method': 'BNB', 'quantization_bit': None, 'quantization_type': 'nf4', 'double_quantization': True, 'quantization_device_map': None, 'image_max_pixels': 589824, 'image_min_pixels': 1024, 'image_do_pan_and_scan': False, 'crop_to_patches': False, 'video_max_pixels': 65536, 'video_min_pixels': 256, 'video_fps': 2.0, 'video_maxlen': 128, 'use_audio_in_video': False, 'audio_sampling_rate': 16000, 'export_dir': None, 'export_size': 5, 'export_device': 'cpu', 'export_quantization_bit': None, 'export_quantization_dataset': None, 'export_quantization_nsamples': 128, 'export_quantization_maxlen': 1024, 'export_legacy_format': False, 'export_hub_model_id': 'Youssef/QWEN_Arabic_Q&A', 'vllm_maxlen': 4096, 'vllm_gpu_util': 0.7, 'vllm_enforce_eager': False, 'vllm_max_lora_rank': 32, 'vllm_config': None, 'sglang_maxlen': 4096, 'sglang_mem_fraction': 0.7, 'sglang_tp_size': -1, 'sglang_config': None, 'sglang_lora_backend': 'triton', 'compute_dtype': 'torch.bfloat16', 'device_map': {'': 'cuda:0'}, 'model_max_length': 2048, 'block_diag_attn': False}, 'data_args': {'template': 'qwen', 'dataset': ['QAtrain'], 'eval_dataset': ['QAval'], 'dataset_dir': 'data', 'media_dir': 'data', 'cutoff_len': 2048, 'train_on_prompt': False, 'mask_history': False, 'streaming': False, 'buffer_size': 16384, 'mix_strategy': 'concat', 'interleave_probs': None, 'overwrite_cache': True, 'preprocessing_batch_size': 1000, 'preprocessing_num_workers': 8, 'max_samples': None, 'eval_num_beams': None, 'ignore_pad_token_for_loss': True, 'val_size': 0.0, 'eval_on_each_dataset': False, 'packing': False, 'neat_packing': False, 'tool_format': None, 'default_system': None, 'enable_thinking': True, 'tokenized_path': 
None, 'data_shared_file_system': False}, 'finetuning_args': {'freeze_trainable_layers': 2, 'freeze_trainable_modules': ['all'], 'freeze_extra_modules': None, 'additional_target': None, 'lora_alpha': 128, 'lora_dropout': 0.0, 'lora_rank': 64, 'lora_target': ['all'], 'loraplus_lr_ratio': None, 'loraplus_lr_embedding': 1e-06, 'use_rslora': False, 'use_dora': False, 'pissa_init': False, 'pissa_iter': 16, 'pissa_convert': False, 'create_new_adapter': False, 'pref_beta': 0.1, 'pref_ftx': 0.0, 'pref_loss': 'sigmoid', 'dpo_label_smoothing': 0.0, 'kto_chosen_weight': 1.0, 'kto_rejected_weight': 1.0, 'simpo_gamma': 0.5, 'ppo_buffer_size': 1, 'ppo_epochs': 4, 'ppo_score_norm': False, 'ppo_target': 6.0, 'ppo_whiten_rewards': False, 'ref_model': None, 'ref_model_adapters': None, 'ref_model_quantization_bit': None, 'reward_model': None, 'reward_model_adapters': None, 'reward_model_quantization_bit': None, 'reward_model_type': 'lora', 'ld_alpha': None, 'use_galore': False, 'galore_target': ['all'], 'galore_rank': 16, 'galore_update_interval': 200, 'galore_scale': 2.0, 'galore_proj_type': 'std', 'galore_layerwise': False, 'use_apollo': False, 'apollo_target': ['all'], 'apollo_rank': 16, 'apollo_update_interval': 200, 'apollo_scale': 32.0, 'apollo_proj': 'random', 'apollo_proj_type': 'std', 'apollo_scale_type': 'channel', 'apollo_layerwise': False, 'apollo_scale_front': False, 'use_badam': False, 'badam_mode': 'layer', 'badam_start_block': None, 'badam_switch_mode': 'ascending', 'badam_switch_interval': 50, 'badam_update_ratio': 0.05, 'badam_mask_mode': 'adjacent', 'badam_verbose': 0, 'use_swanlab': False, 'swanlab_project': 'llamafactory', 'swanlab_workspace': None, 'swanlab_run_name': None, 'swanlab_mode': 'cloud', 'swanlab_api_key': '', 'swanlab_logdir': None, 'swanlab_lark_webhook_url': None, 'swanlab_lark_secret': None, 'pure_bf16': False, 'stage': 'sft', 'finetuning_type': 'lora', 'use_llama_pro': False, 'use_adam_mini': False, 'use_muon': False, 'freeze_vision_tower': True, 
'freeze_multi_modal_projector': True, 'freeze_language_model': False, 'compute_accuracy': False, 'disable_shuffling': False, 'early_stopping_steps': None, 'plot_loss': True, 'include_effective_tokens_per_second': False}, 'generating_args': {'do_sample': True, 'temperature': 0.95, 'top_p': 0.7, 'top_k': 50, 'num_beams': 1, 'max_new_tokens': 1024, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'skip_special_tokens': True}} +2025-06-20 02:17:22,248 INFO MainThread:383 [wandb_init.py:init():872] starting backend +2025-06-20 02:17:22,460 INFO MainThread:383 [wandb_init.py:init():875] sending inform_init request +2025-06-20 02:17:22,474 INFO MainThread:383 [wandb_init.py:init():883] backend started and connected +2025-06-20 02:17:22,479 INFO MainThread:383 [wandb_init.py:init():956] updated telemetry +2025-06-20 02:17:22,491 INFO MainThread:383 [wandb_init.py:init():980] communicating run to backend with 90.0 second timeout +2025-06-20 02:17:22,902 INFO MainThread:383 [wandb_init.py:init():1032] starting run threads in backend +2025-06-20 02:17:23,495 INFO MainThread:383 [wandb_run.py:_console_start():2453] atexit reg +2025-06-20 02:17:23,495 INFO MainThread:383 [wandb_run.py:_redirect():2301] redirect: wrap_raw +2025-06-20 02:17:23,495 INFO MainThread:383 [wandb_run.py:_redirect():2370] Wrapping output streams. +2025-06-20 02:17:23,495 INFO MainThread:383 [wandb_run.py:_redirect():2393] Redirects installed. 
+2025-06-20 02:17:23,498 INFO MainThread:383 [wandb_init.py:init():1078] run started, returning control to user process +2025-06-20 02:17:23,501 INFO MainThread:383 [wandb_run.py:_config_callback():1358] config_cb None None {'peft_config': {'default': {'task_type': , 'peft_type': , 'auto_mapping': None, 'base_model_name_or_path': 'Qwen/Qwen2.5-1.5B', 'revision': None, 'inference_mode': False, 'r': 64, 'target_modules': {'q_proj', 'gate_proj', 'v_proj', 'down_proj', 'k_proj', 'o_proj', 'up_proj'}, 'exclude_modules': None, 'lora_alpha': 128, 'lora_dropout': 0.0, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'loftq_config': {}, 'eva_config': None, 'use_dora': False, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False}}, 'vocab_size': 151936, 'max_position_embeddings': 131072, 'hidden_size': 1536, 'intermediate_size': 8960, 'num_hidden_layers': 28, 'num_attention_heads': 12, 'use_sliding_window': False, 'sliding_window': 131072, 'max_window_layers': 28, 'num_key_value_heads': 2, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'rope_theta': 1000000.0, 'rope_scaling': None, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 
50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['Qwen2ForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 151643, 'pad_token_id': None, 'eos_token_id': 151643, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'Qwen/Qwen2.5-1.5B', '_attn_implementation_autoset': True, 'transformers_version': '4.51.3', 'model_type': 'qwen2', 'use_mrope': False, 'output_dir': '/kaggle/working/', 'overwrite_output_dir': True, 'do_train': True, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 16, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 0.0001, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3.0, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.1, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': '/kaggle/working/runs/Jun20_02-16-37_3d8055426195', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 50, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 200, 'save_total_limit': None, 
'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 200, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': 'Qwennn', 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': ['labels'], 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'tp_size': 0, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': False, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': '/kaggle/working/Model/last-checkpoint', 'hub_model_id': None, 'hub_strategy': 'checkpoint', 'hub_token': '', 'hub_private_repo': True, 'hub_always_push': False, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': 
False, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 180000000, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False, 'average_tokens_across_devices': False, 'sortish_sampler': False, 'predict_with_generate': False, 'generation_max_length': 2048, 'generation_num_beams': None, 'generation_config': None, 'ray_run_name': None, 'ray_storage_path': './saves', 'ray_storage_filesystem': None, 'ray_num_workers': 1, 'resources_per_worker': {'GPU': 1}, 'placement_strategy': 'PACK', 'ray_init_kwargs': None} +2025-06-20 02:17:23,511 INFO MainThread:383 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 1617573376 - > +2025-06-20 02:17:23,511 INFO MainThread:383 [wandb_run.py:_config_callback():1358] config_cb model/num_parameters 1617573376 None +2025-06-20 02:17:23,513 INFO MainThread:383 [wandb_run.py:_config_callback():1358] config_cb None None {'model_args': {'model_name_or_path': 'Qwen/Qwen2.5-1.5B', 'adapter_name_or_path': None, 'adapter_folder': None, 'cache_dir': None, 'use_fast_tokenizer': True, 'resize_vocab': False, 'split_special_tokens': False, 'add_tokens': None, 'add_special_tokens': None, 'model_revision': 'main', 'low_cpu_mem_usage': True, 'rope_scaling': None, 'flash_attn': 'AUTO', 'shift_attn': False, 'mixture_of_depths': None, 'use_unsloth': False, 'use_unsloth_gc': False, 'enable_liger_kernel': False, 'moe_aux_loss_coef': None, 'disable_gradient_checkpointing': False, 'use_reentrant_gc': True, 
'upcast_layernorm': False, 'upcast_lmhead_output': False, 'train_from_scratch': False, 'infer_backend': 'HF', 'offload_folder': 'offload', 'use_cache': True, 'infer_dtype': 'auto', 'hf_hub_token': '', 'ms_hub_token': '', 'om_hub_token': '', 'print_param_status': False, 'trust_remote_code': True, 'quantization_method': 'BNB', 'quantization_bit': None, 'quantization_type': 'nf4', 'double_quantization': True, 'quantization_device_map': None, 'image_max_pixels': 589824, 'image_min_pixels': 1024, 'image_do_pan_and_scan': False, 'crop_to_patches': False, 'video_max_pixels': 65536, 'video_min_pixels': 256, 'video_fps': 2.0, 'video_maxlen': 128, 'use_audio_in_video': False, 'audio_sampling_rate': 16000, 'export_dir': None, 'export_size': 5, 'export_device': 'cpu', 'export_quantization_bit': None, 'export_quantization_dataset': None, 'export_quantization_nsamples': 128, 'export_quantization_maxlen': 1024, 'export_legacy_format': False, 'export_hub_model_id': 'Youssef/QWEN_Arabic_Q&A', 'vllm_maxlen': 4096, 'vllm_gpu_util': 0.7, 'vllm_enforce_eager': False, 'vllm_max_lora_rank': 32, 'vllm_config': None, 'sglang_maxlen': 4096, 'sglang_mem_fraction': 0.7, 'sglang_tp_size': -1, 'sglang_config': None, 'sglang_lora_backend': 'triton', 'compute_dtype': 'torch.bfloat16', 'device_map': {'': 'cuda:0'}, 'model_max_length': 2048, 'block_diag_attn': False}, 'data_args': {'template': 'qwen', 'dataset': ['QAtrain'], 'eval_dataset': ['QAval'], 'dataset_dir': 'data', 'media_dir': 'data', 'cutoff_len': 2048, 'train_on_prompt': False, 'mask_history': False, 'streaming': False, 'buffer_size': 16384, 'mix_strategy': 'concat', 'interleave_probs': None, 'overwrite_cache': True, 'preprocessing_batch_size': 1000, 'preprocessing_num_workers': 8, 'max_samples': None, 'eval_num_beams': None, 'ignore_pad_token_for_loss': True, 'val_size': 0.0, 'eval_on_each_dataset': False, 'packing': False, 'neat_packing': False, 'tool_format': None, 'default_system': None, 'enable_thinking': True, 'tokenized_path': 
None, 'data_shared_file_system': False}, 'finetuning_args': {'freeze_trainable_layers': 2, 'freeze_trainable_modules': ['all'], 'freeze_extra_modules': None, 'additional_target': None, 'lora_alpha': 128, 'lora_dropout': 0.0, 'lora_rank': 64, 'lora_target': ['all'], 'loraplus_lr_ratio': None, 'loraplus_lr_embedding': 1e-06, 'use_rslora': False, 'use_dora': False, 'pissa_init': False, 'pissa_iter': 16, 'pissa_convert': False, 'create_new_adapter': False, 'pref_beta': 0.1, 'pref_ftx': 0.0, 'pref_loss': 'sigmoid', 'dpo_label_smoothing': 0.0, 'kto_chosen_weight': 1.0, 'kto_rejected_weight': 1.0, 'simpo_gamma': 0.5, 'ppo_buffer_size': 1, 'ppo_epochs': 4, 'ppo_score_norm': False, 'ppo_target': 6.0, 'ppo_whiten_rewards': False, 'ref_model': None, 'ref_model_adapters': None, 'ref_model_quantization_bit': None, 'reward_model': None, 'reward_model_adapters': None, 'reward_model_quantization_bit': None, 'reward_model_type': 'lora', 'ld_alpha': None, 'use_galore': False, 'galore_target': ['all'], 'galore_rank': 16, 'galore_update_interval': 200, 'galore_scale': 2.0, 'galore_proj_type': 'std', 'galore_layerwise': False, 'use_apollo': False, 'apollo_target': ['all'], 'apollo_rank': 16, 'apollo_update_interval': 200, 'apollo_scale': 32.0, 'apollo_proj': 'random', 'apollo_proj_type': 'std', 'apollo_scale_type': 'channel', 'apollo_layerwise': False, 'apollo_scale_front': False, 'use_badam': False, 'badam_mode': 'layer', 'badam_start_block': None, 'badam_switch_mode': 'ascending', 'badam_switch_interval': 50, 'badam_update_ratio': 0.05, 'badam_mask_mode': 'adjacent', 'badam_verbose': 0, 'use_swanlab': False, 'swanlab_project': 'llamafactory', 'swanlab_workspace': None, 'swanlab_run_name': None, 'swanlab_mode': 'cloud', 'swanlab_api_key': '', 'swanlab_logdir': None, 'swanlab_lark_webhook_url': None, 'swanlab_lark_secret': None, 'pure_bf16': False, 'stage': 'sft', 'finetuning_type': 'lora', 'use_llama_pro': False, 'use_adam_mini': False, 'use_muon': False, 'freeze_vision_tower': True, 
'freeze_multi_modal_projector': True, 'freeze_language_model': False, 'compute_accuracy': False, 'disable_shuffling': False, 'early_stopping_steps': None, 'plot_loss': True, 'include_effective_tokens_per_second': False}, 'generating_args': {'do_sample': True, 'temperature': 0.95, 'top_p': 0.7, 'top_k': 50, 'num_beams': 1, 'max_new_tokens': 1024, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'skip_special_tokens': True}} diff --git a/LLaMA-Factory/wandb/run-20250620_021722-rdrftts8/files/output.log b/LLaMA-Factory/wandb/run-20250620_021722-rdrftts8/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..5929b4c381c45dde8041815a344291f218859af5 --- /dev/null +++ b/LLaMA-Factory/wandb/run-20250620_021722-rdrftts8/files/output.log @@ -0,0 +1,52 @@ + 59%|████████████████████ | 1500/2547 [8:38:50<18:41:31, 64.27s/it][INFO|trainer.py:4307] 2025-06-20 10:56:14,287 >> +{'loss': 0.4541, 'grad_norm': 0.5041179060935974, 'learning_rate': 7.319880650722838e-05, 'epoch': 1.24} +{'loss': 0.4576, 'grad_norm': 0.5369197726249695, 'learning_rate': 7.01111937773246e-05, 'epoch': 1.3} +{'loss': 0.4472, 'grad_norm': 0.5211925506591797, 'learning_rate': 6.692915791902665e-05, 'epoch': 1.35} +{'loss': 0.4427, 'grad_norm': 0.5664705038070679, 'learning_rate': 6.366763876055806e-05, 'epoch': 1.41} +{'loss': 0.4395, 'grad_norm': 0.5420666337013245, 'learning_rate': 6.034194930847975e-05, 'epoch': 1.47} +{'loss': 0.4305, 'grad_norm': 0.558952271938324, 'learning_rate': 5.6967703852306786e-05, 'epoch': 1.53} +{'loss': 0.428, 'grad_norm': 0.510136067867279, 'learning_rate': 5.356074465458553e-05, 'epoch': 1.59} +{'loss': 0.4251, 'grad_norm': 0.506799578666687, 'learning_rate': 5.013706757062534e-05, 'epoch': 1.65} +{'loss': 0.4188, 'grad_norm': 0.5179591178894043, 'learning_rate': 4.671274694710388e-05, 'epoch': 1.71} +{'loss': 0.4177, 'grad_norm': 0.531908392906189, 'learning_rate': 4.3303860152151445e-05, 'epoch': 1.77} +***** Running Evaluation ***** 
+[INFO|trainer.py:4309] 2025-06-20 10:56:14,287 >> Num examples = 3020 +[INFO|trainer.py:4312] 2025-06-20 10:56:14,287 >> Batch size = 1 + 59%|████████████████████ | 1500/2547 [9:15:04<18:41:31, 64.27s/it][INFO|trainer.py:3984] 2025-06-20 11:32:28,468 >> Saving model checkpoint to /kaggle/working/checkpoint-1500 +[INFO|configuration_utils.py:693] 2025-06-20 11:32:28,617 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen2.5-1.5B/snapshots/8faed761d45a263340a0528343f099c05c9a4323/config.json +{'eval_loss': 0.4321376383304596, 'eval_runtime': 2174.1694, 'eval_samples_per_second': 1.389, 'eval_steps_per_second': 0.695, 'epoch': 1.77} +[INFO|configuration_utils.py:765] 2025-06-20 11:32:28,619 >> Model config Qwen2Config { + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151643, + "hidden_act": "silu", + "hidden_size": 1536, + "initializer_range": 0.02, + "intermediate_size": 8960, + "max_position_embeddings": 131072, + "max_window_layers": 28, + "model_type": "qwen2", + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": 131072, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.51.3", + "use_cache": true, + "use_mrope": false, + "use_sliding_window": false, + "vocab_size": 151936 +} + +[INFO|tokenization_utils_base.py:2510] 2025-06-20 11:32:29,135 >> tokenizer config file saved in /kaggle/working/checkpoint-1500/tokenizer_config.json +[INFO|tokenization_utils_base.py:2519] 2025-06-20 11:32:29,136 >> Special tokens file saved in /kaggle/working/checkpoint-1500/special_tokens_map.json +[INFO|tokenization_utils_base.py:2510] 2025-06-20 11:32:30,350 >> tokenizer config file saved in /kaggle/working/tokenizer_config.json +[INFO|tokenization_utils_base.py:2519] 2025-06-20 11:32:30,351 >> 
Special tokens file saved in /kaggle/working/special_tokens_map.json +It seems you are trying to upload a large folder at once. This might take some time and then fail if the folder is too large. For such cases, it is recommended to upload in smaller batches or to use `HfApi().upload_large_folder(...)`/`huggingface-cli upload-large-folder` instead. For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/upload#upload-a-large-folder. diff --git a/LLaMA-Factory/wandb/run-20250620_021722-rdrftts8/files/requirements.txt b/LLaMA-Factory/wandb/run-20250620_021722-rdrftts8/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..760948d402d83af72da58518d1245b4694fa536a --- /dev/null +++ b/LLaMA-Factory/wandb/run-20250620_021722-rdrftts8/files/requirements.txt @@ -0,0 +1,897 @@ +psutil==7.0.0 +setproctitle==1.2.2 +colorama==0.4.6 +pydantic==1.10.13 +nvidia-cusparse-cu12==12.3.1.170 +nvidia-cusolver-cu12==11.6.1.9 +nvidia-curand-cu12==10.3.5.147 +tomlkit==0.13.3 +sse-starlette==2.3.6 +gradio_client==1.10.1 +tyro==0.8.14 +fastapi==0.115.13 +starlette==0.46.2 +fsspec==2025.3.0 +pydantic_core==2.27.2 +fire==0.7.0 +wandb==0.20.1 +nvidia-cudnn-cu12==9.1.0.70 +av==14.4.0 +gradio==5.31.0 +semantic-version==2.10.0 +python-multipart==0.0.20 +groovy==0.1.2 +shtab==1.7.2 +llamafactory==0.9.4.dev0 +uvicorn==0.34.3 +ffmpy==0.6.0 +safehttpx==0.1.6 +nvidia-cublas-cu12==12.4.5.8 +ruff==0.12.0 +nvidia-nvjitlink-cu12==12.4.127 +trl==0.9.6 +nvidia-cufft-cu12==11.2.1.3 +google-cloud-bigquery==3.25.0 +bq_helper==0.4.1 +joblib==1.5.0 +nltk==3.9.1 +regex==2024.11.6 +click==8.1.8 +tqdm==4.67.1 +lightgbm==4.6.0 +siphash24==1.7 +pytools==2025.1.3 +pycuda==2025.1 +gensim==4.3.3 +torchtune==0.6.1 +tbb==2022.1.0 +mkl==2025.1.0 +tbb4py==2022.1.0 +shapely==2.1.0 +libpysal==4.9.2 +intel-cmplr-lib-ur==2024.2.0 +intel-cmplr-lib-rt==2024.2.0 +mkl-umath==0.1.1 +mkl-service==2.4.1 +mkl-random==1.2.4 +numpy==1.26.4 +intel-openmp==2024.2.0 
+mkl-fft==1.3.8 +pynvjitlink-cu12==0.5.2 +tblib==3.1.0 +psutil==7.0.0 +raft-dask-cu12==25.2.0 +partd==1.4.2 +treelite==4.4.1 +dask==2024.12.1 +cupy-cuda12x==13.4.1 +cuda-python==12.9.0 +pynvml==12.0.0 +ucx-py-cu12==0.42.0 +libcudf-cu12==25.2.2 +nvidia-nvcomp-cu12==4.2.0.11 +numba-cuda==0.2.0 +libcuml-cu12==25.2.1 +msgpack==1.1.0 +importlib_metadata==8.7.0 +fastrlock==0.8.3 +libkvikio-cu12==25.2.1 +distributed==2024.12.1 +libcuvs-cu12==25.2.1 +libucx-cu12==1.18.1 +MarkupSafe==3.0.2 +dask-cudf-cu12==25.2.2 +dask-expr==1.1.21 +rich==14.0.0 +dask-cuda==25.2.0 +zict==3.0.0 +toolz==1.0.0 +cuml-cu12==25.2.1 +pylibcudf-cu12==25.2.2 +locket==1.0.0 +nvidia-ml-py==12.575.51 +packaging==25.0 +scipy==1.15.2 +zipp==3.21.0 +python-dateutil==2.9.0.post0 +markdown-it-py==3.0.0 +tzdata==2025.2 +mdurl==0.1.2 +six==1.17.0 +pylibraft-cu12==25.2.0 +rapids-dask-dependency==25.2.0 +numba==0.60.0 +urllib3==2.4.0 +cloudpickle==3.1.1 +nvtx==0.2.11 +cudf-cu12==25.2.2 +llvmlite==0.43.0 +cuda-bindings==12.9.0 +pandas==2.2.3 +Pygments==2.19.1 +pytz==2025.2 +cachetools==5.5.2 +Jinja2==3.1.6 +rmm-cu12==25.2.0 +libucxx-cu12==0.42.0 +PyYAML==6.0.2 +tornado==6.4.2 +cuvs-cu12==25.2.1 +libraft-cu12==25.2.0 +ucxx-cu12==0.42.0 +sortedcontainers==2.4.0 +typing_extensions==4.13.2 +pyarrow==19.0.1 +distributed-ucxx-cu12==0.42.0 +learntools==0.3.5 +pycparser==2.22 +annotated-types==0.7.0 +charset-normalizer==3.4.2 +kagglehub==0.3.12 +grpcio-status==1.49.0rc1 +frozenlist==1.6.0 +protobuf==3.20.3 +dnspython==2.7.0 +attrs==25.3.0 +in-toto-attestation==0.9.3 +typing-inspection==0.4.0 +id==1.5.0 +rsa==4.9.1 +PyJWT==2.10.1 +pyOpenSSL==25.0.0 +idna==3.10 +email_validator==2.2.0 +cffi==1.17.1 +certifi==2025.4.26 +sigstore==3.6.2 +multiprocess==0.70.16 +google-cloud-automl==1.0.1 +model-signing==1.0.1 +aiohttp==3.11.18 +aiohappyeyeballs==2.6.1 +googleapis-common-protos==1.70.0 +grpclib==0.4.8 +pyasn1==0.6.1 +huggingface-hub==0.31.1 +filelock==3.18.0 +pyasn1_modules==0.4.2 +hpack==4.1.0 +xxhash==3.5.0 
+multidict==6.4.3 +propcache==0.3.1 +sigstore-protobuf-specs==0.3.2 +platformdirs==4.3.8 +rfc3161-client==1.0.1 +requests==2.32.3 +cryptography==44.0.3 +aiosignal==1.3.2 +yarl==1.20.0 +google-auth==2.40.1 +betterproto==2.0.0b6 +google-api-core==1.34.1 +datasets==3.6.0 +securesystemslib==1.3.0 +hyperframe==6.1.0 +rfc8785==0.1.4 +sigstore-rekor-types==0.0.18 +tuf==6.0.0 +grpcio==1.72.0rc1 +h2==4.2.0 +hf-xet==1.1.0 +dill==0.3.8 +tsfresh==0.21.0 +fiona==1.10.1 +urwid_readline==0.15.1 +coverage==7.8.0 +Wand==0.6.13 +xvfbwrapper==0.2.13 +qgrid==1.3.1 +jupyter_client==8.6.3 +woodwork==0.31.0 +overrides==7.7.0 +y-py==0.6.2 +ipywidgets==8.1.5 +ydata-profiling==4.16.1 +hep_ml==0.7.3 +scikit-multilearn==0.2.0 +urwid==3.0.2 +cytoolz==1.0.1 +pytesseract==0.3.13 +click-plugins==1.1.1 +onnx==1.17.0 +odfpy==1.4.1 +mpld3==0.5.10 +Boruta==0.4.3 +docstring-to-markdown==0.17 +fqdn==1.5.1 +torchinfo==1.8.0 +clint==0.5.1 +pybind11==2.13.6 +torchao==0.10.0 +PyWavelets==1.8.0 +python-lsp-server==1.12.2 +jupyter_server_terminals==0.5.3 +keras-core==0.1.7 +pandas-profiling==3.6.6 +asttokens==3.0.0 +scikit-surprise==1.1.4 +vtk==9.3.1 +jupyter-ydoc==0.2.5 +aiofiles==22.1.0 +transformers==4.51.3 +isoduration==20.11.0 +featuretools==1.31.0 +plotly-express==0.4.1 +pycryptodomex==3.22.0 +types-python-dateutil==2.9.0.20241206 +easyocr==1.7.2 +openslide-python==1.4.2 +slicer==0.0.7 +ray==2.46.0 +ImageHash==4.3.1 +pyemd==1.0.0 +fuzzywuzzy==0.18.0 +pyparsing==3.0.9 +xgboost==2.0.3 +pandasql==0.7.3 +update-checker==0.18.0 +pathos==0.3.2 +jupyter_server_fileid==0.9.3 +fasttext==0.9.3 +stopit==1.1.2 +haversine==2.9.0 +pox==0.3.6 +catboost==1.2.8 +colorlog==6.9.0 +jupyter_server==2.12.5 +geojson==3.2.0 +uri-template==1.3.0 +notebook==6.5.4 +pytorch-ignite==0.5.2 +fury==0.12.0 +igraph==0.11.8 +kornia_rs==0.1.9 +google-cloud-vision==3.10.1 +olefile==0.47 +semver==3.0.4 +gymnasium==0.29.0 +nvidia-cuda-cupti-cu12==12.4.127 +TPOT==0.12.1 +google-cloud-translate==3.12.1 +tensorflow-cloud==0.1.5 
+torchdata==0.11.0 +shap==0.44.1 +rtree==1.4.0 +ghapi==1.0.6 +ninja==1.11.1.4 +torchmetrics==1.7.1 +pygltflib==1.16.4 +Cartopy==0.24.1 +nbdev==2.3.36 +jupyter-lsp==1.5.1 +pycryptodome==3.22.0 +gpxpy==1.6.2 +orderly-set==5.4.1 +pymongo==4.12.1 +mlcrate==0.2.0 +papermill==2.6.0 +jupyterlab==3.6.8 +args==0.1.0 +typing-inspect==0.9.0 +omegaconf==2.3.0 +PyUpSet==0.1.1.post7 +dacite==1.9.2 +qtconsole==5.6.1 +visions==0.8.1 +trx-python==0.3 +Chessnut==0.4.1 +beartype==0.20.2 +deap==1.4.3 +lml==0.2.0 +jmespath==1.0.1 +jupyterlab_server==2.27.3 +ypy-websocket==0.8.4 +ansicolors==1.1.8 +tensorflow_decision_forests==1.11.0 +path.py==12.5.0 +blobfile==3.0.0 +tensorflow-io==0.37.1 +pymc3==3.11.4 +wavio==0.0.9 +cligj==0.7.2 +pdf2image==1.17.0 +dipy==1.11.0 +pyaml==25.1.0 +pypdf==5.4.0 +line_profiler==4.2.0 +pydub==0.25.1 +botocore==1.38.11 +google-cloud-videointelligence==2.16.1 +pyLDAvis==3.4.1 +antlr4-python3-runtime==4.9.3 +Janome==0.5.0 +langid==1.1.6 +simpleitk==2.5.0 +pyclipper==1.3.0.post6 +kornia==0.8.1 +scikit-plot==0.3.7 +pydegensac==0.1.2 +jupyter_server_ydoc==0.8.0 +phik==0.12.4 +keras-tuner==1.4.7 +colorama==0.4.6 +scikit-learn-intelex==2025.5.0 +json5==0.12.0 +PyArabic==0.6.15 +ydf==0.9.0 +ujson==5.10.0 +boto3==1.38.11 +alembic==1.15.2 +annoy==1.17.3 +h2o==3.46.0.7 +optuna==4.3.0 +Pympler==1.1 +s3fs==0.4.2 +geopandas==0.14.4 +nbconvert==6.4.5 +scikit-learn==1.2.2 +emoji==2.14.1 +watchdog==6.0.0 +funcy==2.0 +deepdiff==8.4.2 +testpath==0.6.0 +rfc3986-validator==0.1.1 +nvidia-cuda-runtime-cu12==12.4.127 +nbclient==0.5.13 +Theano==1.0.5 +wurlitzer==3.1.1 +python-bidi==0.6.6 +pudb==2025.1 +plum-dispatch==2.5.7 +pytorch-lightning==2.5.1.post0 +squarify==0.4.4 +comm==0.2.2 +dataclasses-json==0.6.7 +jupyter-events==0.12.0 +pettingzoo==1.24.0 +lightning-utilities==0.14.3 +nilearn==0.10.4 +segment_anything==1.0 +kaggle-environments==1.16.11 +marshmallow==3.26.1 +eli5==0.13.0 +widgetsnbextension==4.0.14 +rgf-python==3.12.0 +ipympl==0.9.7 +tiktoken==0.9.0 
+stable-baselines3==2.1.0 +nvidia-cuda-nvrtc-cu12==12.4.127 +jedi==0.19.2 +jupyterlab-lsp==3.10.2 +python-lsp-jsonrpc==1.1.2 +aiosqlite==0.21.0 +QtPy==2.4.3 +pydicom==3.0.1 +multimethod==1.12 +docker==7.1.0 +ppft==1.7.7 +arrow==1.3.0 +isoweek==1.3.3 +texttable==1.7.0 +daal==2025.5.0 +sphinx-rtd-theme==0.2.4 +kt-legacy==1.0.5 +puremagic==1.29 +seaborn==0.12.2 +pyexcel-io==0.6.7 +matplotlib==3.7.2 +Shimmy==1.3.0 +rfc3339-validator==0.1.4 +category_encoders==2.7.0 +stumpy==1.13.0 +mamba==0.11.3 +path==17.1.0 +pyexcel-ods==0.6.0 +preprocessing==0.1.13 +lime==0.2.0.1 +htmlmin==0.1.12 +s3transfer==0.12.0 +cesium==0.12.4 +python-json-logger==3.3.0 +Theano-PyMC==1.1.2 +bayesian-optimization==2.0.3 +keras-cv==0.9.0 +gatspy==0.3 +hf_transfer==0.1.9 +scikit-optimize==0.10.2 +mne==1.9.0 +Mako==1.3.10 +mypy_extensions==1.1.0 +mistune==0.8.4 +setuptools-scm==8.3.1 +execnb==0.1.14 +openslide-bin==4.0.0.8 +google-colab==1.0.0 +mizani==0.13.2 +astunparse==1.6.3 +google-cloud-iam==2.18.3 +ipython==7.34.0 +jax==0.5.2 +pymc==5.21.2 +referencing==0.36.2 +roman-numerals-py==3.1.0 +soxr==0.5.0.post1 +libclang==18.1.1 +keras-nlp==0.18.1 +imageio==2.37.0 +geemap==0.35.3 +google-cloud-firestore==2.20.1 +clarabel==0.10.0 +h11==0.14.0 +db-dtypes==1.4.2 +imagesize==1.4.1 +py-cpuinfo==9.0.0 +debugpy==1.8.0 +stringzilla==3.12.3 +jupyterlab_pygments==0.3.0 +backcall==0.2.0 +tensorflow-hub==0.16.1 +earthengine-api==1.5.9 +requests-oauthlib==2.0.0 +scooby==0.10.0 +opencv-python-headless==4.11.0.86 +dopamine_rl==4.1.2 +etils==1.12.2 +setproctitle==1.3.5 +sklearn-compat==0.1.3 +ipython-genutils==0.2.0 +catalogue==2.0.10 +sphinxcontrib-devhelp==2.0.0 +sklearn-pandas==2.2.0 +Markdown==3.7 +sphinxcontrib-qthelp==2.0.0 +google-auth-httplib2==0.2.0 +Flask==3.1.0 +preshed==3.0.9 +google-cloud-resource-manager==1.14.2 +marisa-trie==1.2.1 +google-cloud-core==2.4.3 +ipyleaflet==0.19.2 +chardet==5.2.0 +jupyter_core==5.7.2 +simple-parsing==0.1.7 +matplotlib-venn==1.1.2 +gin-config==0.5.0 +SQLAlchemy==2.0.40 
+ipython-sql==0.5.0 +toml==0.10.2 +kaggle==1.7.4.2 +jsonpointer==3.0.0 +ndindex==1.9.2 +astropy-iers-data==0.2025.3.31.0.36.18 +proglog==0.1.11 +tensorflow-io-gcs-filesystem==0.37.1 +simplejson==3.20.1 +datascience==0.17.6 +alabaster==1.0.0 +langchain-text-splitters==0.3.7 +pygit2==1.17.0 +pyshp==2.3.1 +PyGObject==3.42.0 +pytest==8.3.5 +gspread==6.2.0 +spacy-legacy==3.0.12 +diffusers==0.32.2 +librosa==0.11.0 +ibis-framework==9.5.0 +fastcore==1.7.29 +requests-toolbelt==1.0.0 +types-pytz==2025.2.0.20250326 +PyDrive==1.3.1 +google-cloud-functions==1.20.2 +imutils==0.5.4 +sentence-transformers==3.4.1 +opt_einsum==3.4.0 +moviepy==1.0.3 +en_core_web_sm==3.8.0 +langchain-core==0.3.50 +nbclassic==1.2.0 +importlib_resources==6.5.2 +xarray-einstats==0.8.0 +lazy_loader==0.4 +ipyevents==2.0.2 +immutabledict==4.2.1 +music21==9.3.0 +openai==1.70.0 +sqlglot==25.20.2 +ale-py==0.10.2 +linkify-it-py==2.0.3 +scikit-image==0.25.2 +language_data==1.3.0 +treescope==0.1.9 +nvidia-cuda-nvcc-cu12==12.5.82 +libcugraph-cu12==25.2.0 +google-crc32c==1.7.1 +google-cloud-language==2.17.1 +torchsummary==1.5.1 +webencodings==0.5.1 +webcolors==24.11.1 +pydot==3.0.4 +orbax-checkpoint==0.11.10 +google-cloud-dataproc==5.18.1 +jellyfish==1.1.0 +gym==0.25.2 +flax==0.10.5 +cramjam==2.9.1 +gdown==5.2.0 +httpimport==1.4.1 +pymystem3==0.2.0 +parso==0.8.4 +py4j==0.10.9.7 +nx-cugraph-cu12==25.2.0 +entrypoints==0.4 +fastprogress==1.0.3 +torchaudio==2.6.0+cu124 +pyogrio==0.10.0 +bigframes==1.42.0 +oauthlib==3.2.2 +tifffile==2025.3.30 +firebase-admin==6.7.0 +fastjsonschema==2.21.1 +psycopg2==2.9.10 +missingno==0.5.2 +pandas-datareader==0.10.0 +google-spark-connect==0.5.2 +Deprecated==1.2.18 +pooch==1.8.2 +cycler==0.12.1 +tensorboard==2.18.0 +tcmlib==1.3.0 +pyproj==3.7.1 +arviz==0.21.0 +duckdb==1.2.1 +inflect==7.5.0 +argon2-cffi-bindings==21.2.0 +namex==0.0.8 +nvidia-nccl-cu12==2.21.5 +rpy2==3.5.17 +torch==2.6.0+cu124 +argon2-cffi==23.1.0 +opencv-contrib-python==4.11.0.86 +atpublic==5.1 
+sphinxcontrib-applehelp==2.0.0 +google-cloud-spanner==3.53.0 +langsmith==0.3.23 +umap-learn==0.5.7 +yfinance==0.2.55 +bleach==6.2.0 +langchain==0.3.22 +jax-cuda12-plugin==0.5.1 +optree==0.14.1 +defusedxml==0.7.1 +sphinxcontrib-serializinghtml==2.0.0 +more-itertools==10.6.0 +python-utils==3.9.1 +timm==1.0.15 +Pyomo==6.8.2 +pydotplus==2.0.2 +ml-dtypes==0.4.1 +peewee==3.17.9 +google-pasta==0.2.0 +pyzmq==24.0.1 +cmdstanpy==1.2.5 +ipyparallel==8.8.0 +parsy==2.1 +bqplot==0.12.44 +spacy-loggers==1.0.5 +google-ai-generativelanguage==0.6.15 +panel==1.6.2 +prophet==1.1.6 +pydata-google-auth==1.9.1 +anyio==4.9.0 +absl-py==1.4.0 +openpyxl==3.1.5 +vega-datasets==0.9.0 +mpmath==1.3.0 +frozendict==2.4.6 +opencv-python==4.11.0.86 +cudf-polars-cu12==25.2.2 +folium==0.19.5 +mdit-py-plugins==0.4.2 +zstandard==0.23.0 +google-cloud-aiplatform==1.87.0 +langcodes==3.5.0 +pytensor==2.30.2 +blinker==1.9.0 +xyzservices==2025.1.0 +googledrivedownloader==1.1.0 +thinc==8.3.4 +google-generativeai==0.8.4 +et_xmlfile==2.0.0 +jieba==0.42.1 +pluggy==1.5.0 +hyperopt==0.2.7 +python-louvain==0.16 +google-auth-oauthlib==1.2.1 +soupsieve==2.6 +PyDrive2==1.21.3 +simsimd==6.2.1 +umf==0.10.0 +peft==0.14.0 +imbalanced-learn==0.13.0 +wcwidth==0.2.13 +narwhals==1.33.0 +typeguard==4.4.2 +blosc2==3.2.1 +spanner-graph-notebook==1.1.6 +progressbar2==4.5.0 +pexpect==4.9.0 +ptyprocess==0.7.0 +pygame==2.6.1 +docker-pycreds==0.4.0 +Cython==3.0.12 +shellingham==1.5.4 +jiter==0.9.0 +CacheControl==0.14.2 +prometheus_client==0.21.1 +nbformat==5.10.4 +python-snappy==0.7.3 +torchvision==0.21.0+cu124 +tensorflow-metadata==1.17.0 +nest-asyncio==1.6.0 +nibabel==5.3.2 +cmake==3.31.6 +multipledispatch==1.0.0 +tf_keras==2.18.0 +cloudpathlib==0.21.0 +networkx==3.4.2 +gcsfs==2025.3.2 +sentencepiece==0.2.0 +einops==0.8.1 +plotly==5.24.1 +bokeh==3.6.3 +pycairo==1.27.0 +ipytree==0.2.2 +python-box==7.3.2 +tensorflow-datasets==4.9.8 +graphviz==0.20.3 +scs==3.2.7.post2 +pillow==11.1.0 +google-api-python-client==2.164.0 
+textblob==0.19.0 +PyOpenGL==3.1.9 +google-cloud-bigtable==2.30.0 +decorator==4.4.2 +google-cloud-datastore==2.20.2 +docstring_parser==0.16 +pickleshare==0.7.5 +fastai==2.7.19 +wrapt==1.17.2 +google-cloud-storage==2.19.0 +GDAL==3.6.4 +wasabi==1.1.3 +spacy==3.8.5 +blis==1.2.1 +tensorflow-text==2.18.1 +optax==0.2.4 +gast==0.6.0 +Werkzeug==3.1.3 +colorcet==3.1.0 +python-slugify==8.0.4 +cvxpy==1.6.4 +miniKanren==1.0.3 +traitlets==5.7.1 +sqlparse==0.5.3 +terminado==0.18.1 +holidays==0.69 +sphinxcontrib-htmlhelp==2.1.0 +orjson==3.10.16 +grpc-interceptor==0.15.4 +geocoder==1.38.1 +pyviz_comms==3.0.4 +babel==2.17.0 +jax-cuda12-pjrt==0.5.1 +ply==3.11 +audioread==3.0.1 +docutils==0.21.2 +osqp==1.0.3 +distro==1.9.0 +tf-slim==1.1.0 +tokenizers==0.21.1 +tzlocal==5.3.1 +cons==0.4.6 +rpds-py==0.24.0 +geographiclib==2.0 +matplotlib-inline==0.1.7 +editdistance==0.8.1 +httpcore==1.0.7 +h5py==3.13.0 +tabulate==0.9.0 +statsmodels==0.14.4 +holoviews==1.20.2 +sentry-sdk==2.25.1 +dlib==19.24.6 +community==1.0.0b1 +bigquery-magics==0.9.0 +gym-notices==0.0.8 +notebook_shim==0.2.4 +soundfile==0.13.1 +pyspark==3.5.5 +itsdangerous==2.2.0 +jsonpatch==1.33 +plotnine==0.14.5 +prompt_toolkit==3.0.50 +traittypes==0.2.1 +autograd==1.7.0 +text-unidecode==1.3 +pycocotools==2.0.8 +jsonpickle==4.0.5 +weasel==0.4.1 +srsly==2.5.1 +wordcloud==1.9.4 +eerepr==0.1.1 +cymem==2.0.11 +smart-open==7.1.0 +patsy==1.0.1 +beautifulsoup4==4.13.3 +opentelemetry-sdk==1.31.1 +tables==3.10.2 +altair==5.5.0 +grpc-google-iam-v1==0.14.2 +cufflinks==0.17.3 +cvxopt==1.3.2 +triton==3.2.0 +PySocks==1.7.1 +uc-micro-py==1.0.3 +proto-plus==1.26.1 +Sphinx==8.2.3 +fonttools==4.57.0 +xlrd==2.0.1 +pynndescent==0.5.13 +numexpr==2.10.2 +array_record==0.7.1 +h5netcdf==1.6.1 +promise==2.3 +threadpoolctl==3.6.0 +Send2Trash==1.8.3 +sniffio==1.3.1 +httplib2==0.22.0 +jupyterlab_widgets==3.0.13 +chex==0.1.89 +confection==0.1.5 +uritemplate==4.1.1 +stanio==0.5.1 +easydict==1.13 +future==1.0.0 +tensorflow==2.18.0 +websocket-client==1.8.0 
+flatbuffers==25.2.10 +Bottleneck==1.4.2 +kiwisolver==1.4.8 +snowballstemmer==2.2.0 +colour==0.1.5 +google-genai==1.9.0 +hdbscan==0.8.40 +sphinxcontrib-jsmath==1.0.1 +google-resumable-media==2.7.2 +murmurhash==1.0.12 +portpicker==1.5.2 +Farama-Notifications==0.0.4 +accelerate==1.5.2 +jaxlib==0.5.1 +sympy==1.13.1 +ipykernel==6.17.1 +pathlib==1.0.1 +websockets==15.0.1 +pandas-stubs==2.2.2.240909 +ratelim==0.1.6 +google-cloud-bigquery-connection==1.18.2 +greenlet==3.1.1 +multitasking==0.0.11 +astropy==7.0.1 +imageio-ffmpeg==0.6.0 +opentelemetry-api==1.31.1 +pyperclip==1.9.0 +jsonschema-specifications==2024.10.1 +tinycss2==1.4.0 +keras==3.8.0 +pylibcugraph-cu12==25.2.0 +tenacity==9.1.2 +cyipopt==1.5.0 +polars==1.21.0 +oauth2client==4.1.3 +typer==0.15.2 +lxml==5.3.1 +etuples==0.3.9 +gspread-dataframe==4.0.0 +albumentations==2.0.5 +geopy==2.4.1 +logical-unification==0.4.6 +natsort==8.4.0 +prettytable==3.16.0 +GitPython==3.1.44 +pyerfa==2.0.1.5 +param==2.2.0 +keras-hub==0.18.1 +xarray==2025.1.2 +pandas-gbq==0.28.0 +google-cloud-pubsub==2.29.0 +gitdb==4.0.12 +safetensors==0.5.3 +httpx==0.28.1 +jsonschema==4.23.0 +nvidia-nvtx-cu12==12.4.127 +albucore==0.0.23 +tweepy==4.15.0 +fastdownload==0.0.7 +highspy==1.9.0 +jupyter-console==6.1.0 +branca==0.8.1 +pandocfilters==1.5.1 +yellowbrick==1.5 +opentelemetry-semantic-conventions==0.52b1 +nvidia-cusparselt-cu12==0.6.2 +contourpy==1.3.1 +tensorboard-data-server==0.7.2 +google==2.0.3 +jupyter-leaflet==0.19.2 +mlxtend==0.23.4 +humanize==4.12.2 +smmap==5.0.2 +tensorstore==0.1.73 +wheel==0.45.1 +glob2==0.7 +tensorflow-probability==0.25.0 +termcolor==3.0.1 +colorlover==0.3.0 +ipyfilechooser==0.6.0 +iniconfig==2.1.0 +dm-tree==0.1.9 +html5lib==1.1 +python-apt==0.0.0 +setuptools==75.2.0 +types-setuptools==78.1.0.20250329 +requirements-parser==0.9.0 +pip==24.1.2 +llamafactory==0.9.4.dev0 +PyGObject==3.42.1 +blinker==1.4 +jeepney==0.7.1 +six==1.16.0 +oauthlib==3.2.0 +wadllib==1.3.6 +launchpadlib==1.10.16 +dbus-python==1.2.18 +PyJWT==2.3.0 
+importlib-metadata==4.6.4 +httplib2==0.20.2 +zipp==1.0.0 +pyparsing==2.4.7 +Markdown==3.3.6 +python-apt==2.4.0+ubuntu4 +Mako==1.1.3 +lazr.restfulclient==0.14.4 +SecretStorage==3.3.1 +distro==1.7.0 +lazr.uri==1.0.6 +more-itertools==8.10.0 +MarkupSafe==2.0.1 +cryptography==3.4.8 +keyring==23.5.0 +packaging==24.1 +inflect==7.3.1 +autocommand==2.2.2 +typeguard==4.3.0 +jaraco.text==3.12.1 +importlib_resources==6.4.0 +wheel==0.43.0 +zipp==3.19.2 +platformdirs==4.2.2 +importlib_metadata==8.0.0 +tomli==2.0.1 +jaraco.collections==5.1.0 +more-itertools==10.3.0 +typing_extensions==4.12.2 +backports.tarfile==1.2.0 +jaraco.functools==4.0.1 +jaraco.context==5.3.0 diff --git a/LLaMA-Factory/wandb/run-20250620_021722-rdrftts8/files/wandb-metadata.json b/LLaMA-Factory/wandb/run-20250620_021722-rdrftts8/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..3b47c077d79dcc3b1facf4daee1cbf1ca17a0ae4 --- /dev/null +++ b/LLaMA-Factory/wandb/run-20250620_021722-rdrftts8/files/wandb-metadata.json @@ -0,0 +1,53 @@ +{ + "os": "Linux-6.6.56+-x86_64-with-glibc2.35", + "python": "CPython 3.11.11", + "startedAt": "2025-06-20T02:17:22.474691Z", + "args": [ + "examples/train_lora/QA.yaml" + ], + "program": "/kaggle/working/LLaMA-Factory/src/llamafactory/launcher.py", + "codePath": "src/llamafactory/launcher.py", + "git": { + "remote": "https://github.com/hiyouga/LLaMA-Factory.git", + "commit": "3a119ed5a2c5aba22e0eca09ea7b11b9d6f7df01" + }, + "email": "youssefhassan437972@gmail.com", + "root": "/kaggle/working/LLaMA-Factory", + "host": "3d8055426195", + "executable": "/usr/bin/python3", + "codePathLocal": "src/llamafactory/launcher.py", + "cpu_count": 2, + "cpu_count_logical": 4, + "gpu": "Tesla T4", + "gpu_count": 2, + "disk": { + "/": { + "total": "8656922775552", + "used": "6838344368128" + } + }, + "memory": { + "total": "33662332928" + }, + "cpu": { + "count": 2, + "countLogical": 4 + }, + "gpu_nvidia": [ + { + "name": "Tesla T4", + "memoryTotal": 
"16106127360", + "cudaCores": 2560, + "architecture": "Turing", + "uuid": "GPU-db1bf042-fc30-6308-a125-ff5352acb0ce" + }, + { + "name": "Tesla T4", + "memoryTotal": "16106127360", + "cudaCores": 2560, + "architecture": "Turing", + "uuid": "GPU-ea6e25c2-cb83-4a13-1c6a-4d1b25a46ad0" + } + ], + "cudaVersion": "12.6" +} \ No newline at end of file diff --git a/LLaMA-Factory/wandb/run-20250620_021722-rdrftts8/logs/debug-core.log b/LLaMA-Factory/wandb/run-20250620_021722-rdrftts8/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..21498bd247fa50ab1f4dc09fa5d47978afcd7e1d --- /dev/null +++ b/LLaMA-Factory/wandb/run-20250620_021722-rdrftts8/logs/debug-core.log @@ -0,0 +1,6 @@ +{"time":"2025-06-20T02:17:22.287663941Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpgaqawtj_/port-383.txt","pid":383,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2025-06-20T02:17:22.297326798Z","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":39333,"Zone":""}} +{"time":"2025-06-20T02:17:22.297459013Z","level":"INFO","msg":"Will exit if parent process dies.","ppid":383} +{"time":"2025-06-20T02:17:22.460573707Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:43590"} +{"time":"2025-06-20T02:17:22.479978193Z","level":"INFO","msg":"handleInformInit: received","streamId":"rdrftts8","id":"127.0.0.1:43590"} +{"time":"2025-06-20T02:17:22.670829055Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"rdrftts8","id":"127.0.0.1:43590"} diff --git a/LLaMA-Factory/wandb/run-20250620_021722-rdrftts8/logs/debug-internal.log b/LLaMA-Factory/wandb/run-20250620_021722-rdrftts8/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..ffaaac2b37b898ad8428f03367923e6032efae5c --- /dev/null +++ b/LLaMA-Factory/wandb/run-20250620_021722-rdrftts8/logs/debug-internal.log 
@@ -0,0 +1,7 @@ +{"time":"2025-06-20T02:17:22.480161661Z","level":"INFO","msg":"stream: starting","core version":"0.20.1","symlink path":"/kaggle/working/LLaMA-Factory/wandb/run-20250620_021722-rdrftts8/logs/debug-core.log"} +{"time":"2025-06-20T02:17:22.670730676Z","level":"INFO","msg":"stream: created new stream","id":"rdrftts8"} +{"time":"2025-06-20T02:17:22.670819673Z","level":"INFO","msg":"stream: started","id":"rdrftts8"} +{"time":"2025-06-20T02:17:22.670905475Z","level":"INFO","msg":"handler: started","stream_id":"rdrftts8"} +{"time":"2025-06-20T02:17:22.670915321Z","level":"INFO","msg":"writer: Do: started","stream_id":"rdrftts8"} +{"time":"2025-06-20T02:17:22.670939142Z","level":"INFO","msg":"sender: started","stream_id":"rdrftts8"} +{"time":"2025-06-20T02:17:22.907655773Z","level":"INFO","msg":"Starting system monitor"} diff --git a/LLaMA-Factory/wandb/run-20250620_021722-rdrftts8/logs/debug.log b/LLaMA-Factory/wandb/run-20250620_021722-rdrftts8/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..7055c326b654db1959c61db9fe2571bd84dab410 --- /dev/null +++ b/LLaMA-Factory/wandb/run-20250620_021722-rdrftts8/logs/debug.log @@ -0,0 +1,25 @@ +2025-06-20 02:17:22,247 INFO MainThread:383 [wandb_setup.py:_flush():81] Current SDK version is 0.20.1 +2025-06-20 02:17:22,247 INFO MainThread:383 [wandb_setup.py:_flush():81] Configure stats pid to 383 +2025-06-20 02:17:22,248 INFO MainThread:383 [wandb_setup.py:_flush():81] Loading settings from /root/.config/wandb/settings +2025-06-20 02:17:22,248 INFO MainThread:383 [wandb_setup.py:_flush():81] Loading settings from /kaggle/working/LLaMA-Factory/wandb/settings +2025-06-20 02:17:22,248 INFO MainThread:383 [wandb_setup.py:_flush():81] Loading settings from environment variables +2025-06-20 02:17:22,248 INFO MainThread:383 [wandb_init.py:setup_run_log_directory():703] Logging user logs to /kaggle/working/LLaMA-Factory/wandb/run-20250620_021722-rdrftts8/logs/debug.log +2025-06-20 
02:17:22,248 INFO MainThread:383 [wandb_init.py:setup_run_log_directory():704] Logging internal logs to /kaggle/working/LLaMA-Factory/wandb/run-20250620_021722-rdrftts8/logs/debug-internal.log +2025-06-20 02:17:22,248 INFO MainThread:383 [wandb_init.py:init():831] calling init triggers +2025-06-20 02:17:22,248 INFO MainThread:383 [wandb_init.py:init():836] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-06-20 02:17:22,248 INFO MainThread:383 [wandb_init.py:init():872] starting backend +2025-06-20 02:17:22,460 INFO MainThread:383 [wandb_init.py:init():875] sending inform_init request +2025-06-20 02:17:22,474 INFO MainThread:383 [wandb_init.py:init():883] backend started and connected +2025-06-20 02:17:22,479 INFO MainThread:383 [wandb_init.py:init():956] updated telemetry +2025-06-20 02:17:22,491 INFO MainThread:383 [wandb_init.py:init():980] communicating run to backend with 90.0 second timeout +2025-06-20 02:17:22,902 INFO MainThread:383 [wandb_init.py:init():1032] starting run threads in backend +2025-06-20 02:17:23,495 INFO MainThread:383 [wandb_run.py:_console_start():2453] atexit reg +2025-06-20 02:17:23,495 INFO MainThread:383 [wandb_run.py:_redirect():2301] redirect: wrap_raw +2025-06-20 02:17:23,495 INFO MainThread:383 [wandb_run.py:_redirect():2370] Wrapping output streams. +2025-06-20 02:17:23,495 INFO MainThread:383 [wandb_run.py:_redirect():2393] Redirects installed. 
+2025-06-20 02:17:23,498 INFO MainThread:383 [wandb_init.py:init():1078] run started, returning control to user process +2025-06-20 02:17:23,501 INFO MainThread:383 [wandb_run.py:_config_callback():1358] config_cb None None {'peft_config': {'default': {'task_type': , 'peft_type': , 'auto_mapping': None, 'base_model_name_or_path': 'Qwen/Qwen2.5-1.5B', 'revision': None, 'inference_mode': False, 'r': 64, 'target_modules': {'q_proj', 'gate_proj', 'v_proj', 'down_proj', 'k_proj', 'o_proj', 'up_proj'}, 'exclude_modules': None, 'lora_alpha': 128, 'lora_dropout': 0.0, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'loftq_config': {}, 'eva_config': None, 'use_dora': False, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False}}, 'vocab_size': 151936, 'max_position_embeddings': 131072, 'hidden_size': 1536, 'intermediate_size': 8960, 'num_hidden_layers': 28, 'num_attention_heads': 12, 'use_sliding_window': False, 'sliding_window': 131072, 'max_window_layers': 28, 'num_key_value_heads': 2, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'rope_theta': 1000000.0, 'rope_scaling': None, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 
50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['Qwen2ForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 151643, 'pad_token_id': None, 'eos_token_id': 151643, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'Qwen/Qwen2.5-1.5B', '_attn_implementation_autoset': True, 'transformers_version': '4.51.3', 'model_type': 'qwen2', 'use_mrope': False, 'output_dir': '/kaggle/working/', 'overwrite_output_dir': True, 'do_train': True, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 16, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 0.0001, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3.0, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.1, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': '/kaggle/working/runs/Jun20_02-16-37_3d8055426195', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 50, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 200, 'save_total_limit': None, 
'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 200, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': 'Qwennn', 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': ['labels'], 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'tp_size': 0, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': False, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': '/kaggle/working/Model/last-checkpoint', 'hub_model_id': None, 'hub_strategy': 'checkpoint', 'hub_token': '', 'hub_private_repo': True, 'hub_always_push': False, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': 
False, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 180000000, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False, 'average_tokens_across_devices': False, 'sortish_sampler': False, 'predict_with_generate': False, 'generation_max_length': 2048, 'generation_num_beams': None, 'generation_config': None, 'ray_run_name': None, 'ray_storage_path': './saves', 'ray_storage_filesystem': None, 'ray_num_workers': 1, 'resources_per_worker': {'GPU': 1}, 'placement_strategy': 'PACK', 'ray_init_kwargs': None} +2025-06-20 02:17:23,511 INFO MainThread:383 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 1617573376 - > +2025-06-20 02:17:23,511 INFO MainThread:383 [wandb_run.py:_config_callback():1358] config_cb model/num_parameters 1617573376 None +2025-06-20 02:17:23,513 INFO MainThread:383 [wandb_run.py:_config_callback():1358] config_cb None None {'model_args': {'model_name_or_path': 'Qwen/Qwen2.5-1.5B', 'adapter_name_or_path': None, 'adapter_folder': None, 'cache_dir': None, 'use_fast_tokenizer': True, 'resize_vocab': False, 'split_special_tokens': False, 'add_tokens': None, 'add_special_tokens': None, 'model_revision': 'main', 'low_cpu_mem_usage': True, 'rope_scaling': None, 'flash_attn': 'AUTO', 'shift_attn': False, 'mixture_of_depths': None, 'use_unsloth': False, 'use_unsloth_gc': False, 'enable_liger_kernel': False, 'moe_aux_loss_coef': None, 'disable_gradient_checkpointing': False, 'use_reentrant_gc': True, 
'upcast_layernorm': False, 'upcast_lmhead_output': False, 'train_from_scratch': False, 'infer_backend': 'HF', 'offload_folder': 'offload', 'use_cache': True, 'infer_dtype': 'auto', 'hf_hub_token': '', 'ms_hub_token': '', 'om_hub_token': '', 'print_param_status': False, 'trust_remote_code': True, 'quantization_method': 'BNB', 'quantization_bit': None, 'quantization_type': 'nf4', 'double_quantization': True, 'quantization_device_map': None, 'image_max_pixels': 589824, 'image_min_pixels': 1024, 'image_do_pan_and_scan': False, 'crop_to_patches': False, 'video_max_pixels': 65536, 'video_min_pixels': 256, 'video_fps': 2.0, 'video_maxlen': 128, 'use_audio_in_video': False, 'audio_sampling_rate': 16000, 'export_dir': None, 'export_size': 5, 'export_device': 'cpu', 'export_quantization_bit': None, 'export_quantization_dataset': None, 'export_quantization_nsamples': 128, 'export_quantization_maxlen': 1024, 'export_legacy_format': False, 'export_hub_model_id': 'Youssef/QWEN_Arabic_Q&A', 'vllm_maxlen': 4096, 'vllm_gpu_util': 0.7, 'vllm_enforce_eager': False, 'vllm_max_lora_rank': 32, 'vllm_config': None, 'sglang_maxlen': 4096, 'sglang_mem_fraction': 0.7, 'sglang_tp_size': -1, 'sglang_config': None, 'sglang_lora_backend': 'triton', 'compute_dtype': 'torch.bfloat16', 'device_map': {'': 'cuda:0'}, 'model_max_length': 2048, 'block_diag_attn': False}, 'data_args': {'template': 'qwen', 'dataset': ['QAtrain'], 'eval_dataset': ['QAval'], 'dataset_dir': 'data', 'media_dir': 'data', 'cutoff_len': 2048, 'train_on_prompt': False, 'mask_history': False, 'streaming': False, 'buffer_size': 16384, 'mix_strategy': 'concat', 'interleave_probs': None, 'overwrite_cache': True, 'preprocessing_batch_size': 1000, 'preprocessing_num_workers': 8, 'max_samples': None, 'eval_num_beams': None, 'ignore_pad_token_for_loss': True, 'val_size': 0.0, 'eval_on_each_dataset': False, 'packing': False, 'neat_packing': False, 'tool_format': None, 'default_system': None, 'enable_thinking': True, 'tokenized_path': 
None, 'data_shared_file_system': False}, 'finetuning_args': {'freeze_trainable_layers': 2, 'freeze_trainable_modules': ['all'], 'freeze_extra_modules': None, 'additional_target': None, 'lora_alpha': 128, 'lora_dropout': 0.0, 'lora_rank': 64, 'lora_target': ['all'], 'loraplus_lr_ratio': None, 'loraplus_lr_embedding': 1e-06, 'use_rslora': False, 'use_dora': False, 'pissa_init': False, 'pissa_iter': 16, 'pissa_convert': False, 'create_new_adapter': False, 'pref_beta': 0.1, 'pref_ftx': 0.0, 'pref_loss': 'sigmoid', 'dpo_label_smoothing': 0.0, 'kto_chosen_weight': 1.0, 'kto_rejected_weight': 1.0, 'simpo_gamma': 0.5, 'ppo_buffer_size': 1, 'ppo_epochs': 4, 'ppo_score_norm': False, 'ppo_target': 6.0, 'ppo_whiten_rewards': False, 'ref_model': None, 'ref_model_adapters': None, 'ref_model_quantization_bit': None, 'reward_model': None, 'reward_model_adapters': None, 'reward_model_quantization_bit': None, 'reward_model_type': 'lora', 'ld_alpha': None, 'use_galore': False, 'galore_target': ['all'], 'galore_rank': 16, 'galore_update_interval': 200, 'galore_scale': 2.0, 'galore_proj_type': 'std', 'galore_layerwise': False, 'use_apollo': False, 'apollo_target': ['all'], 'apollo_rank': 16, 'apollo_update_interval': 200, 'apollo_scale': 32.0, 'apollo_proj': 'random', 'apollo_proj_type': 'std', 'apollo_scale_type': 'channel', 'apollo_layerwise': False, 'apollo_scale_front': False, 'use_badam': False, 'badam_mode': 'layer', 'badam_start_block': None, 'badam_switch_mode': 'ascending', 'badam_switch_interval': 50, 'badam_update_ratio': 0.05, 'badam_mask_mode': 'adjacent', 'badam_verbose': 0, 'use_swanlab': False, 'swanlab_project': 'llamafactory', 'swanlab_workspace': None, 'swanlab_run_name': None, 'swanlab_mode': 'cloud', 'swanlab_api_key': '', 'swanlab_logdir': None, 'swanlab_lark_webhook_url': None, 'swanlab_lark_secret': None, 'pure_bf16': False, 'stage': 'sft', 'finetuning_type': 'lora', 'use_llama_pro': False, 'use_adam_mini': False, 'use_muon': False, 'freeze_vision_tower': True, 
'freeze_multi_modal_projector': True, 'freeze_language_model': False, 'compute_accuracy': False, 'disable_shuffling': False, 'early_stopping_steps': None, 'plot_loss': True, 'include_effective_tokens_per_second': False}, 'generating_args': {'do_sample': True, 'temperature': 0.95, 'top_p': 0.7, 'top_k': 50, 'num_beams': 1, 'max_new_tokens': 1024, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'skip_special_tokens': True}} diff --git a/LLaMA-Factory/wandb/run-20250620_021722-rdrftts8/run-rdrftts8.wandb b/LLaMA-Factory/wandb/run-20250620_021722-rdrftts8/run-rdrftts8.wandb new file mode 100644 index 0000000000000000000000000000000000000000..52a2344051164d8c6802034aad0d3e331cb99f7b --- /dev/null +++ b/LLaMA-Factory/wandb/run-20250620_021722-rdrftts8/run-rdrftts8.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f63d5a869f82244af91b1de474c8ac786aace50a9155ec03a7696d92e881bb0e +size 3211264 diff --git a/Model/.gitattributes b/Model/.gitattributes index fb5ecab7a422a33732ea9d9cab63b23d83e482c0..b30baaf8f5de13f165f9d193062da3a0830e81eb 100644 --- a/Model/.gitattributes +++ b/Model/.gitattributes @@ -33,8 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text -tokenizer.json filter=lfs diff=lfs merge=lfs -text -last-checkpoint/tokenizer.json filter=lfs diff=lfs merge=lfs -text LLaMA-Factory/assets/wechat.jpg filter=lfs diff=lfs merge=lfs -text LLaMA-Factory/assets/wechat_alaya.png filter=lfs diff=lfs merge=lfs -text LLaMA-Factory/assets/wechat_npu.jpg filter=lfs diff=lfs merge=lfs -text @@ -43,3 +41,16 @@ LLaMA-Factory/data/mllm_demo_data/1.mp4 filter=lfs diff=lfs merge=lfs -text LLaMA-Factory/data/mllm_demo_data/2.avi filter=lfs diff=lfs merge=lfs -text LLaMA-Factory/data/mllm_demo_data/3.flac filter=lfs diff=lfs merge=lfs -text LLaMA-Factory/data/mllm_demo_data/3.mp4 filter=lfs diff=lfs merge=lfs -text 
+LLaMA-Factory/wandb/run-20250618_020445-o5waoqcx/run-o5waoqcx.wandb filter=lfs diff=lfs merge=lfs -text +Model/LLaMA-Factory/assets/wechat.jpg filter=lfs diff=lfs merge=lfs -text +Model/LLaMA-Factory/assets/wechat_alaya.png filter=lfs diff=lfs merge=lfs -text +Model/LLaMA-Factory/assets/wechat_npu.jpg filter=lfs diff=lfs merge=lfs -text +Model/LLaMA-Factory/data/mllm_demo_data/1.mp3 filter=lfs diff=lfs merge=lfs -text +Model/LLaMA-Factory/data/mllm_demo_data/1.mp4 filter=lfs diff=lfs merge=lfs -text +Model/LLaMA-Factory/data/mllm_demo_data/2.avi filter=lfs diff=lfs merge=lfs -text +Model/LLaMA-Factory/data/mllm_demo_data/3.flac filter=lfs diff=lfs merge=lfs -text +Model/LLaMA-Factory/data/mllm_demo_data/3.mp4 filter=lfs diff=lfs merge=lfs -text +Model/last-checkpoint/tokenizer.json filter=lfs diff=lfs merge=lfs -text +Model/tokenizer.json filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text +last-checkpoint/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/Model/LLaMA-Factory/.github/workflows/docker.yml b/Model/LLaMA-Factory/.github/workflows/docker.yml index c3c3800591c1f36bdcd83608bc52fdeb0bc635c5..90e9ef87f8d506fa03fa5c8830cf56a93e207284 100644 --- a/Model/LLaMA-Factory/.github/workflows/docker.yml +++ b/Model/LLaMA-Factory/.github/workflows/docker.yml @@ -43,6 +43,16 @@ jobs: - name: Checkout uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.9" + + - name: Get llamafactory version + id: version + run: | + echo "tag=$(python setup.py --version | sed 's/\.dev0//')" >> "$GITHUB_OUTPUT" + - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 @@ -61,6 +71,8 @@ jobs: build-args: | EXTRAS=metrics,deepspeed,liger-kernel push: ${{ github.event_name != 'pull_request' }} - tags: docker.io/hiyouga/llamafactory:latest + tags: | + docker.io/hiyouga/llamafactory:latest + docker.io/hiyouga/llamafactory:${{ steps.version.outputs.tag }} cache-from: type=gha 
cache-to: type=gha,mode=max diff --git a/Model/LLaMA-Factory/README.md b/Model/LLaMA-Factory/README.md index db981f9fef21b3ccf642a5a29ff9046a0b76fc8d..490f8c0a6b8d48401b8d2df014035b8ad4daf424 100644 --- a/Model/LLaMA-Factory/README.md +++ b/Model/LLaMA-Factory/README.md @@ -5,7 +5,7 @@ [![GitHub contributors](https://img.shields.io/github/contributors/hiyouga/LLaMA-Factory?color=orange)](https://github.com/hiyouga/LLaMA-Factory/graphs/contributors) [![GitHub workflow](https://github.com/hiyouga/LLaMA-Factory/actions/workflows/tests.yml/badge.svg)](https://github.com/hiyouga/LLaMA-Factory/actions/workflows/tests.yml) [![PyPI](https://img.shields.io/pypi/v/llamafactory)](https://pypi.org/project/llamafactory/) -[![Citation](https://img.shields.io/badge/citation-561-green)](https://scholar.google.com/scholar?cites=12620864006390196564) +[![Citation](https://img.shields.io/badge/citation-614-green)](https://scholar.google.com/scholar?cites=12620864006390196564) [![Docker Pulls](https://img.shields.io/docker/pulls/hiyouga/llamafactory)](https://hub.docker.com/r/hiyouga/llamafactory/tags) [![Twitter](https://img.shields.io/twitter/follow/llamafactory_ai)](https://twitter.com/llamafactory_ai) @@ -55,7 +55,7 @@ Choose your path: - **Colab (free)**: https://colab.research.google.com/drive/1eRTPn37ltBbYsISy9Aw2NuI2Aq5CQrD9?usp=sharing - **Local machine**: Please refer to [usage](#getting-started) - **PAI-DSW (free trial)**: https://gallery.pai-ml.com/#/preview/deepLearning/nlp/llama_factory -- **Alaya NeW (cloud GPU deal)**: https://docs.alayanew.com/docs/documents/newActivities/llamafactory/?utm_source=LLaMA-Factory +- **Alaya NeW (cloud GPU deal)**: https://docs.alayanew.com/docs/documents/useGuide/LLaMAFactory/mutiple/?utm_source=LLaMA-Factory > [!NOTE] > Except for the above links, all other websites are unauthorized third-party websites. Please carefully use them. 
@@ -105,6 +105,7 @@ Choose your path: ## Blogs +- [A One-Stop Code-Free Model Reinforcement Learning and Deployment Platform based on LLaMA-Factory and EasyR1](https://aws.amazon.com/cn/blogs/china/building-llm-model-hub-based-on-llamafactory-and-easyr1/) (Chinese) - [Fine-tune Qwen2.5-VL for Autonomous Driving using LLaMA-Factory](https://docs.alayanew.com/docs/documents/useGuide/LLaMAFactory/mutiple/?utm_source=LLaMA-Factory) (Chinese) - [How Apoidea Group enhances visual information extraction from banking documents with multimodal models using LLaMA-Factory on Amazon SageMaker HyperPod](https://aws.amazon.com/cn/blogs/machine-learning/how-apoidea-group-enhances-visual-information-extraction-from-banking-documents-with-multimodal-models-using-llama-factory-on-amazon-sagemaker-hyperpod/) (English) - [Easy Dataset × LLaMA Factory: Enabling LLMs to Efficiently Learn Domain Knowledge](https://buaa-act.feishu.cn/wiki/GVzlwYcRFiR8OLkHbL6cQpYin7g) (English) diff --git a/Model/LLaMA-Factory/README_zh.md b/Model/LLaMA-Factory/README_zh.md index 216d9b6ce0ffbcff8aaaf8dd34e9f5b3e1301abe..05f75244697a75f6691cb121612bc60f76e1ca92 100644 --- a/Model/LLaMA-Factory/README_zh.md +++ b/Model/LLaMA-Factory/README_zh.md @@ -5,7 +5,7 @@ [![GitHub contributors](https://img.shields.io/github/contributors/hiyouga/LLaMA-Factory?color=orange)](https://github.com/hiyouga/LLaMA-Factory/graphs/contributors) [![GitHub workflow](https://github.com/hiyouga/LLaMA-Factory/actions/workflows/tests.yml/badge.svg)](https://github.com/hiyouga/LLaMA-Factory/actions/workflows/tests.yml) [![PyPI](https://img.shields.io/pypi/v/llamafactory)](https://pypi.org/project/llamafactory/) -[![Citation](https://img.shields.io/badge/citation-561-green)](https://scholar.google.com/scholar?cites=12620864006390196564) +[![Citation](https://img.shields.io/badge/citation-614-green)](https://scholar.google.com/scholar?cites=12620864006390196564) [![Docker 
Pulls](https://img.shields.io/docker/pulls/hiyouga/llamafactory)](https://hub.docker.com/r/hiyouga/llamafactory/tags) [![Twitter](https://img.shields.io/twitter/follow/llamafactory_ai)](https://twitter.com/llamafactory_ai) @@ -41,7 +41,7 @@ -👋 加入我们的[微信群](assets/wechat.jpg)、[NPU 用户群](assets/wechat_npu.jpg)或 [Alaya NeW 算力优惠群](assets/wechat_alaya.png)。 +👋 加入我们的[微信群](assets/wechat.jpg)、[NPU 用户群](assets/wechat_npu.jpg)或 [九章智算云算力优惠群](assets/wechat_alaya.png)。 \[ [English](README.md) | 中文 \] @@ -57,7 +57,7 @@ https://github.com/user-attachments/assets/43b700c6-a178-41db-b1f8-8190a5d3fcfc - **Colab(免费)**:https://colab.research.google.com/drive/1d5KQtbemerlSDSxZIfAaWXhKr30QypiK?usp=sharing - **本地机器**:请见[如何使用](#如何使用) - **PAI-DSW(免费试用)**:https://gallery.pai-ml.com/#/preview/deepLearning/nlp/llama_factory -- **Alaya NeW(算力优惠活动)**:https://docs.alayanew.com/docs/documents/newActivities/llamafactory/?utm_source=LLaMA-Factory +- **九章智算云(算力优惠活动)**:https://docs.alayanew.com/docs/documents/useGuide/LLaMAFactory/mutiple/?utm_source=LLaMA-Factory > [!NOTE] > 除上述链接以外的其他网站均为未经许可的第三方网站,请小心甄别。 @@ -107,6 +107,7 @@ https://github.com/user-attachments/assets/43b700c6-a178-41db-b1f8-8190a5d3fcfc ## 官方博客 +- [基于 LLaMA-Factory 和 EasyR1 打造一站式无代码大模型强化学习和部署平台 LLM Model Hub](https://aws.amazon.com/cn/blogs/china/building-llm-model-hub-based-on-llamafactory-and-easyr1/)(中文) - [使用 LLaMA-Factory 微调 Qwen2.5-VL 实现自动驾驶场景微调](https://docs.alayanew.com/docs/documents/useGuide/LLaMAFactory/mutiple/?utm_source=LLaMA-Factory)(中文) - [通过亚马逊 SageMaker HyperPod 上的 LLaMA-Factory 增强多模态模型银行文档的视觉信息提取](https://aws.amazon.com/cn/blogs/machine-learning/how-apoidea-group-enhances-visual-information-extraction-from-banking-documents-with-multimodal-models-using-llama-factory-on-amazon-sagemaker-hyperpod/)(英文) - [Easy Dataset × LLaMA Factory: 让大模型高效学习领域知识](https://buaa-act.feishu.cn/wiki/KY9xwTGs1iqHrRkjXBwcZP9WnL9)(中文) diff --git a/Model/LLaMA-Factory/assets/wechat.jpg b/Model/LLaMA-Factory/assets/wechat.jpg index 
40ec32c3a092f95a215cf61f5404ee8610884053..5a0f28f5840456da055e901a8712c21dd9b60845 100644 --- a/Model/LLaMA-Factory/assets/wechat.jpg +++ b/Model/LLaMA-Factory/assets/wechat.jpg @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1cf89f6586b461d5fb37bd915730b03e021be8355eb0c903e7dd6c7de2de1bdb -size 172853 +oid sha256:90db00d9ffdfa2b364b61581c30c409100b8a3e8e25066b3a3217f5710d024eb +size 171500 diff --git a/Model/LLaMA-Factory/assets/wechat_npu.jpg b/Model/LLaMA-Factory/assets/wechat_npu.jpg index 5c389ec141e927db74b3ed6293416167312f1233..b34e89f293cdb632ab4c3e54d01a8dfa9d7f335a 100644 --- a/Model/LLaMA-Factory/assets/wechat_npu.jpg +++ b/Model/LLaMA-Factory/assets/wechat_npu.jpg @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bd9a9d1fee3541605ff6e493e3331be2b02d220b874e6e830256246521e94517 -size 170889 +oid sha256:8241933348dc7fd5863541aa7471e67a1164bb20d021c0a45af4177d40ab71b7 +size 172107 diff --git a/Model/LLaMA-Factory/examples/train_lora/QA.yaml b/Model/LLaMA-Factory/examples/train_lora/QA.yaml index 86176228763dd78cafef8009b8893740bb82fdef..6dadf374fa3b5f9dc3d3475147163cc47f33a51c 100644 --- a/Model/LLaMA-Factory/examples/train_lora/QA.yaml +++ b/Model/LLaMA-Factory/examples/train_lora/QA.yaml @@ -1,6 +1,6 @@ ### model -model_name_or_path: Qwen/Qwen3-0.6B +model_name_or_path: Qwen/Qwen2.5-1.5B trust_remote_code: true ### method @@ -14,39 +14,39 @@ lora_target: all dataset: QAtrain eval_dataset: QAval template: qwen -cutoff_len: 4096 +cutoff_len: 2048 # max_samples: 50 overwrite_cache: true -preprocessing_num_workers: 32 +preprocessing_num_workers: 8 ### output -resume_from_checkpoint: False -output_dir: "/content/drive/MyDrive/youtube-resources" +resume_from_checkpoint: /kaggle/working/Model/last-checkpoint +output_dir: /kaggle/working/ logging_steps: 50 -save_steps: 500 +save_steps: 200 plot_loss: true overwrite_output_dir: true ### train per_device_train_batch_size: 1 -gradient_accumulation_steps: 4 
+gradient_accumulation_steps: 16 learning_rate: 1.0e-4 num_train_epochs: 3.0 lr_scheduler_type: cosine warmup_ratio: 0.1 bf16: true # full ddp_timeout: 180000000 - +torch_compile: false ### eval # val_size: 0.1 per_device_eval_batch_size: 1 -eval_strategy: epoch -# eval_steps: 500 +eval_strategy: steps +eval_steps: 200 report_to: wandb run_name: Qwennn push_to_hub: true -export_hub_model_id: "Youssef/QWEN_Arabic_Q&A" +export_hub_model_id: Youssef/QWEN_Arabic_Q&A hub_private_repo: true hub_strategy: checkpoint diff --git a/Model/LLaMA-Factory/setup.py b/Model/LLaMA-Factory/setup.py index 8d3607bc50bf0964d1580950179d6c51de4b038d..11235be7f41f0d3ec92cc295d3b2703b14c7272d 100644 --- a/Model/LLaMA-Factory/setup.py +++ b/Model/LLaMA-Factory/setup.py @@ -52,7 +52,7 @@ extra_require = { "eetq": ["eetq"], "gptq": ["optimum>=1.24.0", "gptqmodel>=2.0.0"], "aqlm": ["aqlm[gpu]>=1.1.0"], - "vllm": ["vllm>=0.4.3,<=0.8.6"], + "vllm": ["vllm>=0.4.3,<=0.9.1"], "sglang": ["sglang[srt]>=0.4.5", "transformers==4.51.1"], "galore": ["galore-torch"], "apollo": ["apollo-torch"], diff --git a/Model/LLaMA-Factory/src/llamafactory.egg-info/PKG-INFO b/Model/LLaMA-Factory/src/llamafactory.egg-info/PKG-INFO index d980498d89c228141056fc8f449246d3f539fa3c..196cc79bba9c8dbd817bb8b2a8e074591b7d88c5 100644 --- a/Model/LLaMA-Factory/src/llamafactory.egg-info/PKG-INFO +++ b/Model/LLaMA-Factory/src/llamafactory.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.4 Name: llamafactory -Version: 0.9.3.dev0 +Version: 0.9.4.dev0 Summary: Unified Efficient Fine-Tuning of 100+ LLMs Home-page: https://github.com/hiyouga/LLaMA-Factory Author: hiyouga @@ -76,7 +76,7 @@ Requires-Dist: gptqmodel>=2.0.0; extra == "gptq" Provides-Extra: aqlm Requires-Dist: aqlm[gpu]>=1.1.0; extra == "aqlm" Provides-Extra: vllm -Requires-Dist: vllm<=0.8.6,>=0.4.3; extra == "vllm" +Requires-Dist: vllm<=0.9.1,>=0.4.3; extra == "vllm" Provides-Extra: sglang Requires-Dist: sglang[srt]>=0.4.5; extra == "sglang" Requires-Dist: 
transformers==4.51.1; extra == "sglang" @@ -129,7 +129,7 @@ Dynamic: summary [![GitHub contributors](https://img.shields.io/github/contributors/hiyouga/LLaMA-Factory?color=orange)](https://github.com/hiyouga/LLaMA-Factory/graphs/contributors) [![GitHub workflow](https://github.com/hiyouga/LLaMA-Factory/actions/workflows/tests.yml/badge.svg)](https://github.com/hiyouga/LLaMA-Factory/actions/workflows/tests.yml) [![PyPI](https://img.shields.io/pypi/v/llamafactory)](https://pypi.org/project/llamafactory/) -[![Citation](https://img.shields.io/badge/citation-561-green)](https://scholar.google.com/scholar?cites=12620864006390196564) +[![Citation](https://img.shields.io/badge/citation-614-green)](https://scholar.google.com/scholar?cites=12620864006390196564) [![Docker Pulls](https://img.shields.io/docker/pulls/hiyouga/llamafactory)](https://hub.docker.com/r/hiyouga/llamafactory/tags) [![Twitter](https://img.shields.io/twitter/follow/llamafactory_ai)](https://twitter.com/llamafactory_ai) @@ -179,7 +179,7 @@ Choose your path: - **Colab (free)**: https://colab.research.google.com/drive/1eRTPn37ltBbYsISy9Aw2NuI2Aq5CQrD9?usp=sharing - **Local machine**: Please refer to [usage](#getting-started) - **PAI-DSW (free trial)**: https://gallery.pai-ml.com/#/preview/deepLearning/nlp/llama_factory -- **Alaya NeW (cloud GPU deal)**: https://docs.alayanew.com/docs/documents/newActivities/llamafactory/?utm_source=LLaMA-Factory +- **Alaya NeW (cloud GPU deal)**: https://docs.alayanew.com/docs/documents/useGuide/LLaMAFactory/mutiple/?utm_source=LLaMA-Factory > [!NOTE] > Except for the above links, all other websites are unauthorized third-party websites. Please carefully use them. 
@@ -229,6 +229,7 @@ Choose your path: ## Blogs +- [A One-Stop Code-Free Model Reinforcement Learning and Deployment Platform based on LLaMA-Factory and EasyR1](https://aws.amazon.com/cn/blogs/china/building-llm-model-hub-based-on-llamafactory-and-easyr1/) (Chinese) - [Fine-tune Qwen2.5-VL for Autonomous Driving using LLaMA-Factory](https://docs.alayanew.com/docs/documents/useGuide/LLaMAFactory/mutiple/?utm_source=LLaMA-Factory) (Chinese) - [How Apoidea Group enhances visual information extraction from banking documents with multimodal models using LLaMA-Factory on Amazon SageMaker HyperPod](https://aws.amazon.com/cn/blogs/machine-learning/how-apoidea-group-enhances-visual-information-extraction-from-banking-documents-with-multimodal-models-using-llama-factory-on-amazon-sagemaker-hyperpod/) (English) - [Easy Dataset × LLaMA Factory: Enabling LLMs to Efficiently Learn Domain Knowledge](https://buaa-act.feishu.cn/wiki/GVzlwYcRFiR8OLkHbL6cQpYin7g) (English) diff --git a/Model/LLaMA-Factory/src/llamafactory.egg-info/requires.txt b/Model/LLaMA-Factory/src/llamafactory.egg-info/requires.txt index 6022598325da0430260ab4403cc22f43b0f3b724..a3bf651675144c3ca0a8ce70032fb338075555c3 100644 --- a/Model/LLaMA-Factory/src/llamafactory.egg-info/requires.txt +++ b/Model/LLaMA-Factory/src/llamafactory.egg-info/requires.txt @@ -108,4 +108,4 @@ torch-npu==2.4.0.post2 decorator [vllm] -vllm<=0.8.6,>=0.4.3 +vllm<=0.9.1,>=0.4.3 diff --git a/Model/LLaMA-Factory/src/llamafactory/__pycache__/__init__.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/__pycache__/__init__.cpython-311.pyc index 9b5ed032f2f6bb0d1e17f74fae51df00a59787d0..0079644ad729d251814c9d472d47af0139931c52 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/__pycache__/__init__.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/__pycache__/__init__.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/data/__pycache__/__init__.cpython-311.pyc 
b/Model/LLaMA-Factory/src/llamafactory/data/__pycache__/__init__.cpython-311.pyc index 7e8d629e4e6d1ced3632a99670b7717da497efb5..e18c069708f67f5d0c8c71a180aec7c02790835b 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/data/__pycache__/__init__.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/data/__pycache__/__init__.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/data/__pycache__/collator.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/data/__pycache__/collator.cpython-311.pyc index 56727de32a697116fcd45b6aee083674f6dec80f..f9a173951329196acbe1e40b87ab4654996ac2b2 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/data/__pycache__/collator.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/data/__pycache__/collator.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/data/__pycache__/converter.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/data/__pycache__/converter.cpython-311.pyc index 80e2ab0a954c512152949fecc6abf0f017c96c33..10480e4ff0aab962c3ffe4d2b559ce9019b12a89 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/data/__pycache__/converter.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/data/__pycache__/converter.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/data/__pycache__/data_utils.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/data/__pycache__/data_utils.cpython-311.pyc index 132af953c0d3f4343b232d917d77f5f61d835db3..2a5c95819449c82f670a2851c6ea77f168088e3c 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/data/__pycache__/data_utils.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/data/__pycache__/data_utils.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/data/__pycache__/formatter.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/data/__pycache__/formatter.cpython-311.pyc index 
76c54e0c709dc83f2959605102d07f942f115aba..25f16b76bfb70299ef5b9af721d88d2d1469fb88 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/data/__pycache__/formatter.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/data/__pycache__/formatter.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/data/__pycache__/loader.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/data/__pycache__/loader.cpython-311.pyc index 46f7fa1bcb7faf51fbfc91a137a5c60430773dac..8869c8af0da3f7bc30a5cafefbef76381e57c988 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/data/__pycache__/loader.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/data/__pycache__/loader.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/data/__pycache__/mm_plugin.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/data/__pycache__/mm_plugin.cpython-311.pyc index 6dfc247276661d6df59fa244d18cff1a67d20173..aed3e260c20e8d52a2436f7ea5252a947d9a4d33 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/data/__pycache__/mm_plugin.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/data/__pycache__/mm_plugin.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/data/__pycache__/parser.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/data/__pycache__/parser.cpython-311.pyc index 75aa0f648785fe187c58a2333897726d2fdaf98f..aa8a24b780d38b6768e316f8d0db3a9053776143 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/data/__pycache__/parser.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/data/__pycache__/parser.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/data/__pycache__/template.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/data/__pycache__/template.cpython-311.pyc index c3c52f2b8a9eeedb7739e6c740209afaffce3d5b..83188dc52c47af0ab8412f596c7114981aff3df1 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/data/__pycache__/template.cpython-311.pyc and 
b/Model/LLaMA-Factory/src/llamafactory/data/__pycache__/template.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/data/__pycache__/tool_utils.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/data/__pycache__/tool_utils.cpython-311.pyc index a5605a7f40ccf070afe93ff701d907f5c5cc2fe1..9b74d2621c539a1b6f5636ad26509136a0101594 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/data/__pycache__/tool_utils.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/data/__pycache__/tool_utils.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/data/collator.py b/Model/LLaMA-Factory/src/llamafactory/data/collator.py index 3fb08f4b9d6b934942a4bb060ab0c13593a3feb0..b749aaef73d76e7515a6ae13c3dbd1e3d0bbfebe 100644 --- a/Model/LLaMA-Factory/src/llamafactory/data/collator.py +++ b/Model/LLaMA-Factory/src/llamafactory/data/collator.py @@ -21,6 +21,7 @@ from typing import TYPE_CHECKING, Any, Literal, Optional import numpy as np import torch import torch.nn.functional as F +from peft import PeftModel from transformers import DataCollatorForSeq2Seq from ..extras.constants import AUDIO_PLACEHOLDER, IGNORE_INDEX, IMAGE_PLACEHOLDER @@ -94,6 +95,16 @@ class MultiModalDataCollatorForSeq2Seq(DataCollatorForSeq2Seq): if self.template is None: raise ValueError("Template is required for MultiModalDataCollator.") + if isinstance(self.model, PeftModel): + self.model = self.model.base_model.model + + if self.model is not None and hasattr(self.model, "get_rope_index"): # for qwen2vl mrope + self.get_rope_func = self.model.get_rope_index # transformers < 4.52.0 or qwen2.5 omni + elif self.model is not None and hasattr(self.model, "model") and hasattr(self.model.model, "get_rope_index"): + self.get_rope_func = self.model.model.get_rope_index # transformers >= 4.52.0 + else: + self.get_rope_func = None + def __call__(self, features: list[dict[str, Any]]) -> dict[str, "torch.Tensor"]: batch_images, batch_videos, batch_audios = [], [], [] 
batch_imglens, batch_vidlens, batch_audlens, batch_input_ids = [], [], [], [] @@ -171,7 +182,7 @@ class MultiModalDataCollatorForSeq2Seq(DataCollatorForSeq2Seq): features: dict[str, torch.Tensor] = super().__call__(features) - if self.model is not None and hasattr(self.model, "get_rope_index"): # for qwen2vl mrope + if self.get_rope_func is not None: rope_index_kwargs = { "input_ids": features["input_ids"], "image_grid_thw": mm_inputs.get("image_grid_thw"), @@ -180,27 +191,29 @@ class MultiModalDataCollatorForSeq2Seq(DataCollatorForSeq2Seq): } if "second_per_grid_ts" in mm_inputs: # for qwen2vl rope_index_kwargs["second_per_grid_ts"] = mm_inputs.get("second_per_grid_ts") - if "video_second_per_grid" in mm_inputs: # for qwen2omni + elif "video_second_per_grid" in mm_inputs: # for qwen2.5 omni rope_index_kwargs["second_per_grids"] = mm_inputs.get("video_second_per_grid") - if getattr(self.model.config, "model_type", None) == "qwen2_5_omni_thinker": # for qwen2omni + if getattr(self.model.config, "model_type", None) == "qwen2_5_omni_thinker": # for qwen2.5 omni rope_index_kwargs["use_audio_in_video"] = getattr(self.processor, "use_audio_in_video", False) feature_attention_mask = mm_inputs.get("feature_attention_mask", None) - if feature_attention_mask is not None: - audio_feature_lengths = torch.sum( - feature_attention_mask, dim=1 - ) # FIXME need to get video image lengths + if feature_attention_mask is not None: # FIXME: need to get video image lengths + audio_feature_lengths = torch.sum(feature_attention_mask, dim=1) rope_index_kwargs["audio_seqlens"] = audio_feature_lengths # prepare for input - delta0 = (1 - rope_index_kwargs["attention_mask"]).sum(dim=-1).unsqueeze(1) - # avoid conflict - new_position_ids, rope_deltas = self.model.get_rope_index(**rope_index_kwargs) - features["position_ids"], features["rope_deltas"] = ( - new_position_ids.clone(), - rope_deltas - delta0, - ) # avoid inplace operation FIXME + features["position_ids"], rope_deltas = 
self.get_rope_func(**rope_index_kwargs) + features["rope_deltas"] = rope_deltas - (1 - rope_index_kwargs["attention_mask"]).sum( + dim=-1 + ).unsqueeze(-1) else: # for qwen2vl - features["position_ids"], features["rope_deltas"] = self.model.get_rope_index(**rope_index_kwargs) + features["position_ids"], features["rope_deltas"] = self.get_rope_func(**rope_index_kwargs) + + if ( + self.model is not None + and getattr(self.model.config, "model_type", None) in ["qwen2_vl", "qwen2_5_vl", "qwen2_5_omni_thinker"] + and ("position_ids" not in features or features["position_ids"].dim() != 3) + ): + raise ValueError("Qwen2-VL/Qwen2.5-Omni model requires 3D position ids for mrope.") if "cross_attention_mask" in mm_inputs: # for mllama inputs when pad_to_multiple_of is enabled cross_attention_mask = mm_inputs.pop("cross_attention_mask") diff --git a/Model/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/__init__.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/__init__.cpython-311.pyc index 09cd7804809d221890749cf7945c7b7f6bf91d5e..78d6c81bb18e72cf5451fea51854f35d24206940 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/__init__.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/__init__.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/feedback.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/feedback.cpython-311.pyc index f12d4b60406fe4dd8e74603880435599e0d1232f..d4e93eefdc4f0b2db959aaad58e3e11c63851a2d 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/feedback.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/feedback.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/pairwise.cpython-311.pyc 
b/Model/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/pairwise.cpython-311.pyc index b063a86e5d4382108f8c235b5ffc1f743763d5f0..4d82be171aa75122ee4cbfba3bfc150f50deddc5 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/pairwise.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/pairwise.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/pretrain.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/pretrain.cpython-311.pyc index 0ad70712bb06596dda575fb58ea491b0ce17037d..7c7c7577b68d6b432f19fe810ed343580734c1ac 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/pretrain.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/pretrain.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/processor_utils.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/processor_utils.cpython-311.pyc index 02b60d6976c6a58766fc785a52c7a12ae8909d8c..508a1435e34b1ab8734c723956bbcdae1abd16f0 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/processor_utils.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/processor_utils.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/supervised.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/supervised.cpython-311.pyc index 45c06c0a8febf9809f0167cb19d620644e0b1314..480210f27ef861b74c892eac399e50bfa1fb0589 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/supervised.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/supervised.cpython-311.pyc differ diff --git 
a/Model/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/unsupervised.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/unsupervised.cpython-311.pyc index 258de36ea5f7e3e525edaa27214d2c938d48c45b..7eb9fd84b0e883bdf195da6ede79a6d245576310 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/unsupervised.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/data/processor/__pycache__/unsupervised.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/extras/__pycache__/__init__.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/extras/__pycache__/__init__.cpython-311.pyc index cca47e86354672017a018ffca69d15695b325fb2..55e44ce3551d4fea85bb2002d04010f0883aa2af 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/extras/__pycache__/__init__.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/extras/__pycache__/__init__.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/extras/__pycache__/constants.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/extras/__pycache__/constants.cpython-311.pyc index ca05b33c805d9812889b6cc6d620fe69a575eb16..1e978829aa689661b7ef97a60c22c3ff0636b9be 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/extras/__pycache__/constants.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/extras/__pycache__/constants.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/extras/__pycache__/env.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/extras/__pycache__/env.cpython-311.pyc index 04f82325f44227b36e308595d4f2666b6e961983..8f43590514a7c889847977e246ce8877f65dd3b0 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/extras/__pycache__/env.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/extras/__pycache__/env.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/extras/__pycache__/logging.cpython-311.pyc 
b/Model/LLaMA-Factory/src/llamafactory/extras/__pycache__/logging.cpython-311.pyc index c703e2c9a9f420f5018a391a4503e92b2440170c..1af8a3327ab76ce5e2e6c3ffcda236660e0a3834 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/extras/__pycache__/logging.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/extras/__pycache__/logging.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/extras/__pycache__/misc.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/extras/__pycache__/misc.cpython-311.pyc index a447b7fb89337bbaa0f4229e047b74d82bbc4b1d..e5d1def642185baa71e319c275894a9a026d3042 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/extras/__pycache__/misc.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/extras/__pycache__/misc.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/extras/__pycache__/packages.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/extras/__pycache__/packages.cpython-311.pyc index e2ebc74021410649269cb778dcd0a9f2067e3bc2..5097d0b28ce36f21dea02d2a0defa8462523bc3b 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/extras/__pycache__/packages.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/extras/__pycache__/packages.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/extras/__pycache__/ploting.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/extras/__pycache__/ploting.cpython-311.pyc index eef5f91a98821dbc98181e9b08a41aa9ea68231d..78d1693d283dd4cefa9f84558cd9d36e68610920 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/extras/__pycache__/ploting.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/extras/__pycache__/ploting.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/extras/constants.py b/Model/LLaMA-Factory/src/llamafactory/extras/constants.py index da19ebb7a9005d82aea3cd296c6ae8e25660ee3d..f582d1f0f462715e98990522db359b787ccc34e8 100644 --- 
a/Model/LLaMA-Factory/src/llamafactory/extras/constants.py +++ b/Model/LLaMA-Factory/src/llamafactory/extras/constants.py @@ -1609,13 +1609,13 @@ register_model_group( register_model_group( models={ - "Mistral-Small-24B-Base-2503": { - DownloadSource.DEFAULT: "mistralai/Mistral-Small-24B-Base-2503", - DownloadSource.MODELSCOPE: "mistralai/Mistral-Small-24B-Base-2503", + "Mistral-Small-3.1-24B-Base": { + DownloadSource.DEFAULT: "mistralai/Mistral-Small-3.1-24B-Base-2503", + DownloadSource.MODELSCOPE: "mistralai/Mistral-Small-3.1-24B-Base-2503", }, - "Mistral-Small-24B-Instruct-2503": { - DownloadSource.DEFAULT: "mistralai/Mistral-Small-24B-Instruct-2503", - DownloadSource.MODELSCOPE: "mistralai/Mistral-Small-24B-Instruct-2503", + "Mistral-Small-3.1-24B-Instruct": { + DownloadSource.DEFAULT: "mistralai/Mistral-Small-3.1-24B-Instruct-2503", + DownloadSource.MODELSCOPE: "mistralai/Mistral-Small-3.1-24B-Instruct-2503", }, }, template="mistral_small", diff --git a/Model/LLaMA-Factory/src/llamafactory/extras/env.py b/Model/LLaMA-Factory/src/llamafactory/extras/env.py index c4872ea47a8be38a4339f691bf091b6f179e4846..6b676a61ab670a1bdd4a77e11978150bd19f83fa 100644 --- a/Model/LLaMA-Factory/src/llamafactory/extras/env.py +++ b/Model/LLaMA-Factory/src/llamafactory/extras/env.py @@ -27,7 +27,7 @@ import trl from transformers.utils import is_torch_cuda_available, is_torch_npu_available -VERSION = "0.9.3.dev0" +VERSION = "0.9.4.dev0" def print_env() -> None: diff --git a/Model/LLaMA-Factory/src/llamafactory/hparams/__pycache__/__init__.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/hparams/__pycache__/__init__.cpython-311.pyc index 40369a6a3506e85ff4d7e163176cab5beb1d9efa..7db1eae5f6476490bb5e8cfc5f7968450d320f83 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/hparams/__pycache__/__init__.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/hparams/__pycache__/__init__.cpython-311.pyc differ diff --git 
a/Model/LLaMA-Factory/src/llamafactory/hparams/__pycache__/data_args.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/hparams/__pycache__/data_args.cpython-311.pyc index f7ab975052bb5f9f6a591363d719b1b6f397f87a..d14298da60cb86f83b946856ef2063eb895e48ef 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/hparams/__pycache__/data_args.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/hparams/__pycache__/data_args.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/hparams/__pycache__/evaluation_args.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/hparams/__pycache__/evaluation_args.cpython-311.pyc index a46070ee90707e10586bca73fefe5af69e4bf771..16401a053a0369ba9a8648ae9668e5d0f5e82f57 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/hparams/__pycache__/evaluation_args.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/hparams/__pycache__/evaluation_args.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/hparams/__pycache__/finetuning_args.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/hparams/__pycache__/finetuning_args.cpython-311.pyc index 611586103a5d1bc8b59702b7198f3b4b30e05904..d25caf8a6e0ccc9cdd53100af5a3afd5693c0feb 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/hparams/__pycache__/finetuning_args.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/hparams/__pycache__/finetuning_args.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/hparams/__pycache__/generating_args.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/hparams/__pycache__/generating_args.cpython-311.pyc index 90e6e87be3a39637040c088aca6de791956605e2..1bab5829aa31b8c1c65c28322a53bab1ba4afd83 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/hparams/__pycache__/generating_args.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/hparams/__pycache__/generating_args.cpython-311.pyc differ diff --git 
a/Model/LLaMA-Factory/src/llamafactory/hparams/__pycache__/model_args.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/hparams/__pycache__/model_args.cpython-311.pyc index 06a281021684b2b92e091a1798418a172a3c5119..a46f788cd5dff57df3bb508d2c28327312bd8918 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/hparams/__pycache__/model_args.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/hparams/__pycache__/model_args.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/hparams/__pycache__/parser.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/hparams/__pycache__/parser.cpython-311.pyc index cc9f61e55bda8395e4caa9baee652558289975ce..1c041eca5167eea8ea9054a33bce0021f1688c9c 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/hparams/__pycache__/parser.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/hparams/__pycache__/parser.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/hparams/__pycache__/training_args.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/hparams/__pycache__/training_args.cpython-311.pyc index d51e731b85d2fe21a5bb788039f458ebbbe0aff1..3d8f438db03ca43cab4b8ca74c3cc5bb3a4bbc45 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/hparams/__pycache__/training_args.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/hparams/__pycache__/training_args.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/hparams/finetuning_args.py b/Model/LLaMA-Factory/src/llamafactory/hparams/finetuning_args.py index 6217015d8e285c72ee3ec0063ba0430c8869979f..43596864ca050e4013121dc0852eb8f4d144d9bf 100644 --- a/Model/LLaMA-Factory/src/llamafactory/hparams/finetuning_args.py +++ b/Model/LLaMA-Factory/src/llamafactory/hparams/finetuning_args.py @@ -202,6 +202,15 @@ class RLHFArguments: default="lora", metadata={"help": "The type of the reward model in PPO training. 
Lora model only supports lora training."}, ) + ld_alpha: Optional[float] = field( + default=None, + metadata={ + "help": ( + "Alpha parameter from the LD-DPO paper, which controls the weighting of" + " the verbose token log-probabilities in responses." + ) + }, + ) @dataclass diff --git a/Model/LLaMA-Factory/src/llamafactory/hparams/parser.py b/Model/LLaMA-Factory/src/llamafactory/hparams/parser.py index 7b0c04767f24bfb519fc14fceaf33f0527734c7a..91cee7293572a7a4b53684608da4026f3a3652b9 100644 --- a/Model/LLaMA-Factory/src/llamafactory/hparams/parser.py +++ b/Model/LLaMA-Factory/src/llamafactory/hparams/parser.py @@ -148,7 +148,7 @@ def _check_extra_dependencies( check_version("mixture-of-depth>=1.1.6", mandatory=True) if model_args.infer_backend == EngineName.VLLM: - check_version("vllm>=0.4.3,<=0.8.6") + check_version("vllm>=0.4.3,<=0.9.1") check_version("vllm", mandatory=True) elif model_args.infer_backend == EngineName.SGLANG: check_version("sglang>=0.4.5") @@ -169,10 +169,15 @@ def _check_extra_dependencies( if finetuning_args.plot_loss: check_version("matplotlib", mandatory=True) - if training_args is not None and training_args.predict_with_generate: - check_version("jieba", mandatory=True) - check_version("nltk", mandatory=True) - check_version("rouge_chinese", mandatory=True) + if training_args is not None: + if training_args.deepspeed: + # pin deepspeed version < 0.17 because of https://github.com/deepspeedai/DeepSpeed/issues/7347 + check_version("deepspeed>=0.10.0,<=0.16.9", mandatory=True) + + if training_args.predict_with_generate: + check_version("jieba", mandatory=True) + check_version("nltk", mandatory=True) + check_version("rouge_chinese", mandatory=True) def _parse_train_args(args: Optional[Union[dict[str, Any], list[str]]] = None) -> _TRAIN_CLS: diff --git a/Model/LLaMA-Factory/src/llamafactory/model/__pycache__/__init__.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/model/__pycache__/__init__.cpython-311.pyc index 
c4fc6fab4f541fb51ac13a85b60e7c4056e76ed5..366ef09e4fc5a918201a65c7e3dda7eedc89102d 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/model/__pycache__/__init__.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/model/__pycache__/__init__.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/model/__pycache__/adapter.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/model/__pycache__/adapter.cpython-311.pyc index 6ca972c77a56c9f647e0c4ab7b26b715dce16325..85fecc6e387c2fabd67ed3e6c1792c320f2af83f 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/model/__pycache__/adapter.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/model/__pycache__/adapter.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/model/__pycache__/loader.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/model/__pycache__/loader.cpython-311.pyc index a3ebf0e83b11647d15f9684805760d34da9012e3..18dcdfaad729bbad9e4e474d18c471d42bf14bb8 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/model/__pycache__/loader.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/model/__pycache__/loader.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/model/__pycache__/patcher.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/model/__pycache__/patcher.cpython-311.pyc index 4a44bcac6417199abb6d9d14060ba62707b5531e..64e922fea36c20522b7bf9b63c772e73d4e0f2ec 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/model/__pycache__/patcher.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/model/__pycache__/patcher.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/model/loader.py b/Model/LLaMA-Factory/src/llamafactory/model/loader.py index cbcc6b288805974a921ec792b546b20f5b6c0b47..7ed4230a8145dcd95d208479b21edf10a90a16de 100644 --- a/Model/LLaMA-Factory/src/llamafactory/model/loader.py +++ b/Model/LLaMA-Factory/src/llamafactory/model/loader.py @@ -86,10 +86,10 @@ def 
load_tokenizer(model_args: "ModelArguments") -> "TokenizerModule": padding_side="right", **init_kwargs, ) - except ValueError: # try the fast one + except ValueError: # try another one tokenizer = AutoTokenizer.from_pretrained( model_args.model_name_or_path, - use_fast=True, + use_fast=not model_args.use_fast_tokenizer, padding_side="right", **init_kwargs, ) @@ -97,12 +97,23 @@ def load_tokenizer(model_args: "ModelArguments") -> "TokenizerModule": raise OSError("Failed to load tokenizer.") from e patch_tokenizer(tokenizer, model_args) + try: - processor = AutoProcessor.from_pretrained(model_args.model_name_or_path, **init_kwargs) - patch_processor(processor, tokenizer, model_args) + processor = AutoProcessor.from_pretrained( + model_args.model_name_or_path, + use_fast=model_args.use_fast_tokenizer, + **init_kwargs, + ) + except ValueError: # try another one + processor = AutoProcessor.from_pretrained( + model_args.model_name_or_path, + use_fast=not model_args.use_fast_tokenizer, + **init_kwargs, + ) except Exception as e: - logger.info_rank0(f"Failed to load processor: {e}.") - processor = None + raise OSError("Failed to load processor.") from e + + patch_processor(processor, tokenizer, model_args) # Avoid load tokenizer, see: # https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/models/auto/processing_auto.py#L324 diff --git a/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/__init__.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/__init__.cpython-311.pyc index 6fe9f42bf278762b77763af376174701cb38ab8f..cd18482d1203b12ac5fdf62c7156f3ad6ae34c9b 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/__init__.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/__init__.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/attention.cpython-311.pyc 
b/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/attention.cpython-311.pyc index 0c6c81e352c183ca6b212da8f4ebb4a9318e0b54..cb347857ac2ff6e95c9a8f2ac5fbce9f53c9ba30 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/attention.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/attention.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/checkpointing.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/checkpointing.cpython-311.pyc index 7ce53d61662d9daca2ed8e0cc21ee6e257e229ec..4eb58b86afca79384a318d7e5438beb1a30efc43 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/checkpointing.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/checkpointing.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/embedding.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/embedding.cpython-311.pyc index 8c86e47222d82e99786ae5ddeb63e437ccfcc938..dc37b0129f56be95cab5825eedfdbb796e510f5b 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/embedding.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/embedding.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/kv_cache.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/kv_cache.cpython-311.pyc index 3cd935f7570419e6207c662b0ac0190a1edada72..a7c143c5fa729cbfde1190f24e6637f80e90a66e 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/kv_cache.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/kv_cache.cpython-311.pyc differ diff --git 
a/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/liger_kernel.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/liger_kernel.cpython-311.pyc index 7c0d19424a4d30485e916cc8c104e5cd2b9dcd03..bf12fd68c0ec38c7ed9b74d25148be7fe7e33bf1 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/liger_kernel.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/liger_kernel.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/longlora.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/longlora.cpython-311.pyc index d5948f7da33a24ccd9b8e499f4c412a7c6bbd592..d323710cf235f975f674b42b91bf8006dab30e30 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/longlora.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/longlora.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/misc.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/misc.cpython-311.pyc index 440dd0acc631bf41ab7b3019ca01e4dc260a68b8..6833531f7fc405cbac4064e3b6708ef0830e2584 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/misc.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/misc.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/mod.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/mod.cpython-311.pyc index 330b450fb294075f365f3a270f7cb87251eb581b..3660f04810e5b676c61a9f8800f9b44a6e661665 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/mod.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/mod.cpython-311.pyc differ 
diff --git a/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/moe.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/moe.cpython-311.pyc index 44eddfa7d25a20bffb367cba9500256f311cfd2d..a2380d033789d60eb371e8f86554ca95a27705dd 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/moe.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/moe.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/packing.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/packing.cpython-311.pyc index 50447783102daca45ef24c601a4fb80547b98d42..4718e09e15404971b832c24ace6b9b0f020c4f24 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/packing.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/packing.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/quantization.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/quantization.cpython-311.pyc index 231997315d6d0c201301e77e7e98c34c3c1d3e1e..ee7838943fea9b2232c73b8aa8b9216c7eb5e7e2 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/quantization.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/quantization.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/rope.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/rope.cpython-311.pyc index 97b313df0e0cf84a0a388a0e7d956efb5cf61f6e..e8103754c13bf84b5afc16f850e77f594cff6619 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/rope.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/rope.cpython-311.pyc 
differ diff --git a/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/unsloth.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/unsloth.cpython-311.pyc index cba08e2f967435fa56f14c9e12ee65e39292d007..1bc649eee463a422a94cb116325ceedcbda2003a 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/unsloth.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/unsloth.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/valuehead.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/valuehead.cpython-311.pyc index 4e0344bf275a5d9ef16476874ada1d94f7bfe354..80dc712f411a7df0022e83be1801e5cfc8191b85 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/valuehead.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/valuehead.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/visual.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/visual.cpython-311.pyc index ba1b6ad2e790ea9e8f04ddcb6a1d7c9d0b733444..11956dd0ccf8bba74492e2f4e0b9b97ae3736969 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/visual.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/model/model_utils/__pycache__/visual.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/model/model_utils/liger_kernel.py b/Model/LLaMA-Factory/src/llamafactory/model/model_utils/liger_kernel.py index 3f467752d976c672ae4866945edf9235b683f251..2cf30b11af863381d0d490edd811934270fa8f12 100644 --- a/Model/LLaMA-Factory/src/llamafactory/model/model_utils/liger_kernel.py +++ b/Model/LLaMA-Factory/src/llamafactory/model/model_utils/liger_kernel.py @@ -73,6 +73,8 @@ def apply_liger_kernel( from liger_kernel.transformers 
import apply_liger_kernel_to_qwen2_5_vl as apply_liger_kernel elif model_type == "qwen3": from liger_kernel.transformers import apply_liger_kernel_to_qwen3 as apply_liger_kernel + elif model_type == "qwen3_moe": + from liger_kernel.transformers import apply_liger_kernel_to_qwen3_moe as apply_liger_kernel else: logger.warning_rank0("Current model does not support liger kernel.") return diff --git a/Model/LLaMA-Factory/src/llamafactory/model/model_utils/visual.py b/Model/LLaMA-Factory/src/llamafactory/model/model_utils/visual.py index 9d4e535a2d7e3be3e16956689f7255e2624c93a1..ba2bf5c9856fe52cb86b1a5c9867602c72e888b7 100644 --- a/Model/LLaMA-Factory/src/llamafactory/model/model_utils/visual.py +++ b/Model/LLaMA-Factory/src/llamafactory/model/model_utils/visual.py @@ -76,7 +76,7 @@ def _register_composite_model( model_type=model_type, projector_key=projector_key or "multi_modal_projector", vision_model_keys=vision_model_keys or ["vision_tower"], - language_model_keys=language_model_keys or ["language_model"], + language_model_keys=language_model_keys or ["language_model", "lm_head"], lora_conflict_keys=lora_conflict_keys or [], ) @@ -200,12 +200,12 @@ def patch_target_modules( _register_composite_model( - model_type="internvl", + model_type="gemma3", ) _register_composite_model( - model_type="gemma3", + model_type="internvl", ) @@ -246,14 +246,8 @@ _register_composite_model( lora_conflict_keys=["audio_projection_layer"], ) - -_register_composite_model( - model_type="paligemma", -) - - _register_composite_model( - model_type="video_llava", + model_type="mistral3", ) @@ -264,7 +258,7 @@ _register_composite_model( _register_composite_model( - model_type="mistral3", + model_type="paligemma", ) @@ -287,7 +281,9 @@ _register_composite_model( model_type="qwen2_vl", projector_key="visual.merger", vision_model_keys=["visual.patch_embed", "visual.blocks"], - language_model_keys=["language_model"] if is_transformers_version_greater_than("4.52.0") else ["model", "lm_head"], + 
language_model_keys=["language_model", "lm_head"] + if is_transformers_version_greater_than("4.52.0") + else ["model", "lm_head"], lora_conflict_keys=["patch_embed"], ) @@ -296,6 +292,13 @@ _register_composite_model( model_type="qwen2_5_vl", projector_key="visual.merger", vision_model_keys=["visual.patch_embed", "visual.blocks"], - language_model_keys=["language_model"] if is_transformers_version_greater_than("4.52.0") else ["model", "lm_head"], + language_model_keys=["language_model", "lm_head"] + if is_transformers_version_greater_than("4.52.0") + else ["model", "lm_head"], lora_conflict_keys=["patch_embed"], ) + + +_register_composite_model( + model_type="video_llava", +) diff --git a/Model/LLaMA-Factory/src/llamafactory/train/__pycache__/__init__.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/train/__pycache__/__init__.cpython-311.pyc index 8b774ef43cacc3723804c08d951a8ed7156cb0fd..f1e248f0492ab564979080bd34d4e2cf19ea2474 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/train/__pycache__/__init__.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/train/__pycache__/__init__.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/train/__pycache__/callbacks.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/train/__pycache__/callbacks.cpython-311.pyc index ebdfb50b32281c4c961917e3b58d3692d605deca..290d7c4ca732170c285d7af86857f2003df879cb 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/train/__pycache__/callbacks.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/train/__pycache__/callbacks.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/train/__pycache__/trainer_utils.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/train/__pycache__/trainer_utils.cpython-311.pyc index ae20d5b4e6139a7c88d135d868c40a9d074289d8..2d6bd2c86d494399699cfde94eaf1e1903bfd089 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/train/__pycache__/trainer_utils.cpython-311.pyc and 
b/Model/LLaMA-Factory/src/llamafactory/train/__pycache__/trainer_utils.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/train/__pycache__/tuner.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/train/__pycache__/tuner.cpython-311.pyc index e0afd738d3000d7ca0f5baacc142224372ae340a..ed32ca4d68eea9c5b3428d0d1febabc220b534a0 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/train/__pycache__/tuner.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/train/__pycache__/tuner.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/train/dpo/__pycache__/__init__.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/train/dpo/__pycache__/__init__.cpython-311.pyc index 6771b5aaba084e204f2bf74886252bdd6b4ba759..d753b9a4de7eea76e2795be777803e8797705a60 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/train/dpo/__pycache__/__init__.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/train/dpo/__pycache__/__init__.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/train/dpo/__pycache__/trainer.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/train/dpo/__pycache__/trainer.cpython-311.pyc index 9130e1454f655bd74c32ec146e3d0002166c662a..587315b855b419fad9dc2aa8e40740fecc2a7e78 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/train/dpo/__pycache__/trainer.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/train/dpo/__pycache__/trainer.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/train/dpo/__pycache__/workflow.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/train/dpo/__pycache__/workflow.cpython-311.pyc index 1897e4af315ddc7baf90c07ad43eb77921489bdb..992a898f44bc6d945616eda65ebab0c09c8fa928 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/train/dpo/__pycache__/workflow.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/train/dpo/__pycache__/workflow.cpython-311.pyc differ diff --git 
a/Model/LLaMA-Factory/src/llamafactory/train/dpo/trainer.py b/Model/LLaMA-Factory/src/llamafactory/train/dpo/trainer.py index 2539127ce90c93f8a5b77566fa3ded863424e3af..63822e88fe45a56bc38cc7bf1949f4b2ed121a03 100644 --- a/Model/LLaMA-Factory/src/llamafactory/train/dpo/trainer.py +++ b/Model/LLaMA-Factory/src/llamafactory/train/dpo/trainer.py @@ -80,6 +80,7 @@ class CustomDPOTrainer(DPOTrainer): self.ftx_gamma = finetuning_args.pref_ftx self.label_smoothing = finetuning_args.dpo_label_smoothing self.simpo_gamma = finetuning_args.simpo_gamma + self.ld_alpha = finetuning_args.ld_alpha Trainer.__init__(self, model=model, **kwargs) self.model_accepts_loss_kwargs = False # overwrite trainer's default behavior @@ -177,7 +178,7 @@ class CustomDPOTrainer(DPOTrainer): @override def concatenated_forward( - self, model: "PreTrainedModel", batch: dict[str, "torch.Tensor"] + self, model: "PreTrainedModel", batch: dict[str, "torch.Tensor"], is_ref_model: bool = False ) -> tuple["torch.Tensor", "torch.Tensor", "torch.Tensor", "torch.Tensor", "torch.Tensor"]: r"""Compute the sum log probabilities of the labels under given logits if loss_type is not IPO, ORPO or SimPO. 
@@ -187,7 +188,9 @@ class CustomDPOTrainer(DPOTrainer): batch = nested_detach(batch, clone=True) # avoid error all_logits: torch.Tensor = model(**batch, return_dict=True, use_cache=False).logits.to(torch.float32) - all_logps, valid_length = get_batch_logps(logits=all_logits, labels=batch["labels"]) + all_logps, valid_length = get_batch_logps( + logits=all_logits, labels=batch["labels"], ld_alpha=(self.ld_alpha if not is_ref_model else None) + ) if self.loss_type in ["ipo", "orpo", "simpo"]: all_logps = all_logps / valid_length @@ -217,7 +220,9 @@ class CustomDPOTrainer(DPOTrainer): ref_context = nullcontext() with torch.no_grad(), ref_context: - reference_chosen_logps, reference_rejected_logps, *_ = self.concatenated_forward(ref_model, batch) + reference_chosen_logps, reference_rejected_logps, *_ = self.concatenated_forward( + ref_model, batch, is_ref_model=True + ) return reference_chosen_logps, reference_rejected_logps diff --git a/Model/LLaMA-Factory/src/llamafactory/train/kto/__pycache__/__init__.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/train/kto/__pycache__/__init__.cpython-311.pyc index f7e89428af16d7fe3229f50fd46056df14c5c1ae..7c09272257cb8b3a7f3b7ec1f41b7c6c1bcca43e 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/train/kto/__pycache__/__init__.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/train/kto/__pycache__/__init__.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/train/kto/__pycache__/trainer.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/train/kto/__pycache__/trainer.cpython-311.pyc index 33c49f6308a6ca8a388298392fb6e5f3a3547d28..1e8fc514f0e50283075eb9800959ee06014e4f16 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/train/kto/__pycache__/trainer.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/train/kto/__pycache__/trainer.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/train/kto/__pycache__/workflow.cpython-311.pyc 
b/Model/LLaMA-Factory/src/llamafactory/train/kto/__pycache__/workflow.cpython-311.pyc index c72e5d137abe428f18fde03724f633bac60fca2c..e59c7f7488f8ff5298960ff04c6ec77e924167a1 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/train/kto/__pycache__/workflow.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/train/kto/__pycache__/workflow.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/train/ppo/__pycache__/__init__.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/train/ppo/__pycache__/__init__.cpython-311.pyc index d0ab585a418da3865286dcd54fa1f09258d22016..21dff9835ea7d30f11a93a3b2756ab7ecf779b2a 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/train/ppo/__pycache__/__init__.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/train/ppo/__pycache__/__init__.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/train/ppo/__pycache__/ppo_utils.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/train/ppo/__pycache__/ppo_utils.cpython-311.pyc index 8e5efb4eaee616990403dcec45107311b9a175b3..14be3537657d18e34aa8a84c883a42bdfc1fef7a 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/train/ppo/__pycache__/ppo_utils.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/train/ppo/__pycache__/ppo_utils.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/train/ppo/__pycache__/trainer.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/train/ppo/__pycache__/trainer.cpython-311.pyc index 674bd087b468e502389a0faaf36b31b13cf67367..5cb64fb3157e9965c605c4f92035a636e3221ff4 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/train/ppo/__pycache__/trainer.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/train/ppo/__pycache__/trainer.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/train/ppo/__pycache__/workflow.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/train/ppo/__pycache__/workflow.cpython-311.pyc index 
69f02745ea1117c6950f980904ff16ae8f43f6c8..c68121dc560210110b4e2306dabb968942fd9f68 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/train/ppo/__pycache__/workflow.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/train/ppo/__pycache__/workflow.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/train/pt/__pycache__/__init__.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/train/pt/__pycache__/__init__.cpython-311.pyc index 2bc131aec5f4878672d0e8ca85ba0636283a24ab..f97fd31e55364fb8dd1eea343cadadea517f7450 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/train/pt/__pycache__/__init__.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/train/pt/__pycache__/__init__.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/train/pt/__pycache__/trainer.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/train/pt/__pycache__/trainer.cpython-311.pyc index a34c4594dee4c70fc71fea8731f30c948a73dc7d..875b84279297d061efc4acb714063304ce85043b 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/train/pt/__pycache__/trainer.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/train/pt/__pycache__/trainer.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/train/pt/__pycache__/workflow.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/train/pt/__pycache__/workflow.cpython-311.pyc index 87c251e6d77e041b803374053dfcf5c740f5e2b2..1a64340e5146a869a76f9492ebe5550ef249e0b0 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/train/pt/__pycache__/workflow.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/train/pt/__pycache__/workflow.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/train/rm/__pycache__/__init__.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/train/rm/__pycache__/__init__.cpython-311.pyc index 946baa1e0e4e95fd7db1e296f6265372039bb782..0dba75ea8f5a2605a66f9b2bff2013099e5972e2 100644 Binary files 
a/Model/LLaMA-Factory/src/llamafactory/train/rm/__pycache__/__init__.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/train/rm/__pycache__/__init__.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/train/rm/__pycache__/metric.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/train/rm/__pycache__/metric.cpython-311.pyc index 30c00979ba7c96c0327f5cd2d15ecf7bc1a19dff..9999071083feb54a6cdf64c5a55a93b295d5f75e 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/train/rm/__pycache__/metric.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/train/rm/__pycache__/metric.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/train/rm/__pycache__/trainer.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/train/rm/__pycache__/trainer.cpython-311.pyc index 7b2abf754716ee97111dc4a80a1a588216fe8a54..bbeb9ef0a4e9e641fee455b001bbd1d96bb44c8b 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/train/rm/__pycache__/trainer.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/train/rm/__pycache__/trainer.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/train/rm/__pycache__/workflow.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/train/rm/__pycache__/workflow.cpython-311.pyc index 78a409c3cb632404996c7099a4843e60112776ee..5a80f27dff7a4e4476094b5f83aa216e06672da8 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/train/rm/__pycache__/workflow.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/train/rm/__pycache__/workflow.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/train/sft/__pycache__/__init__.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/train/sft/__pycache__/__init__.cpython-311.pyc index 3afde4d60a3e982d993eb0f03eacbf8576e064fb..60f3e9c473cec66bff891a260cc70ecb1b0817cf 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/train/sft/__pycache__/__init__.cpython-311.pyc and 
b/Model/LLaMA-Factory/src/llamafactory/train/sft/__pycache__/__init__.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/train/sft/__pycache__/metric.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/train/sft/__pycache__/metric.cpython-311.pyc index a9634dec5ac9ef8490504fe9947c5a5789a1a171..a5610c8b2043bb0906c44d234653594b2ee9530b 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/train/sft/__pycache__/metric.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/train/sft/__pycache__/metric.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/train/sft/__pycache__/trainer.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/train/sft/__pycache__/trainer.cpython-311.pyc index 75f8f1c662ca983157415256e6f65d32389627cf..6731cf20c66bfa9ab48b54e6fa5b7a34d109ddc9 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/train/sft/__pycache__/trainer.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/train/sft/__pycache__/trainer.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/train/sft/__pycache__/workflow.cpython-311.pyc b/Model/LLaMA-Factory/src/llamafactory/train/sft/__pycache__/workflow.cpython-311.pyc index 0eef18f9f1ab18bb9939f555306b198464b762fe..e036b23ab59bcb7e2f394045354efc7f2b6e863b 100644 Binary files a/Model/LLaMA-Factory/src/llamafactory/train/sft/__pycache__/workflow.cpython-311.pyc and b/Model/LLaMA-Factory/src/llamafactory/train/sft/__pycache__/workflow.cpython-311.pyc differ diff --git a/Model/LLaMA-Factory/src/llamafactory/train/trainer_utils.py b/Model/LLaMA-Factory/src/llamafactory/train/trainer_utils.py index 873962209cc68a4e556441bfe30f5fa77999f559..7d35bbeb6b535a7f69d9eae2b5d576a14506cf92 100644 --- a/Model/LLaMA-Factory/src/llamafactory/train/trainer_utils.py +++ b/Model/LLaMA-Factory/src/llamafactory/train/trainer_utils.py @@ -585,7 +585,10 @@ def create_custom_scheduler( def get_batch_logps( - logits: "torch.Tensor", labels: "torch.Tensor", 
label_pad_token_id: int = IGNORE_INDEX + logits: "torch.Tensor", + labels: "torch.Tensor", + label_pad_token_id: int = IGNORE_INDEX, + ld_alpha: Optional[float] = None, ) -> tuple["torch.Tensor", "torch.Tensor"]: r"""Compute the log probabilities of the given labels under the given logits. @@ -602,7 +605,30 @@ def get_batch_logps( loss_mask = labels != label_pad_token_id labels[labels == label_pad_token_id] = 0 # dummy token per_token_logps = torch.gather(logits.log_softmax(-1), dim=2, index=labels.unsqueeze(2)).squeeze(2) - return (per_token_logps * loss_mask).sum(-1), loss_mask.sum(-1) + + valid_length = loss_mask.sum(-1) + if ld_alpha is not None: + num_examples = labels.shape[0] // 2 + chosen_lengths = valid_length[:num_examples] + rejected_lengths = valid_length[num_examples:] + min_lengths = torch.min(chosen_lengths, rejected_lengths) + start_positions = torch.argmax(loss_mask.int(), dim=1) + public_lengths = start_positions + torch.cat([min_lengths, min_lengths], dim=0) + + seq_len = labels.shape[-1] + position_ids = torch.arange(seq_len, device=per_token_logps.device).expand_as(per_token_logps) + + ld_mask = position_ids < public_lengths.unsqueeze(1) + front_mask = (ld_mask * loss_mask).float() + rear_mask = (~ld_mask * loss_mask).float() + + front_logps = (per_token_logps * front_mask).sum(-1) + rear_logps = (per_token_logps * rear_mask).sum(-1) + logps = front_logps + ld_alpha * rear_logps + else: + logps = (per_token_logps * loss_mask).sum(-1) + + return logps, valid_length def nested_detach( diff --git a/Model/LLaMA-Factory/tests/data/test_collator.py b/Model/LLaMA-Factory/tests/data/test_collator.py index a263c0f8bdafb3ed99b7a90f18fec3b4857b3cf2..657f280dfa13739fa077c8a39e33b82f610bb1f2 100644 --- a/Model/LLaMA-Factory/tests/data/test_collator.py +++ b/Model/LLaMA-Factory/tests/data/test_collator.py @@ -16,6 +16,7 @@ import os import torch from PIL import Image +from transformers import AutoConfig, AutoModelForVision2Seq from llamafactory.data import 
get_template_and_fix_tokenizer from llamafactory.data.collator import MultiModalDataCollatorForSeq2Seq, prepare_4d_attention_mask @@ -72,12 +73,17 @@ def test_base_collator(): def test_multimodal_collator(): model_args, data_args, *_ = get_infer_args( - {"model_name_or_path": "Qwen/Qwen2-VL-7B-Instruct", "template": "qwen2_vl"} + {"model_name_or_path": "Qwen/Qwen2-VL-2B-Instruct", "template": "qwen2_vl"} ) tokenizer_module = load_tokenizer(model_args) template = get_template_and_fix_tokenizer(tokenizer_module["tokenizer"], data_args) + config = AutoConfig.from_pretrained(model_args.model_name_or_path) + with torch.device("meta"): + model = AutoModelForVision2Seq.from_config(config) + data_collator = MultiModalDataCollatorForSeq2Seq( template=template, + model=model, pad_to_multiple_of=4, label_pad_token_id=IGNORE_INDEX, **tokenizer_module, @@ -107,8 +113,15 @@ def test_multimodal_collator(): "labels": [ [0, 1, 2, 3, q, q, q, q, q, q, q, q], ], + "position_ids": [ + [[0, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1]], + [[0, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1]], + [[0, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1]], + ], + "rope_deltas": [[-8]], **tokenizer_module["processor"].image_processor(fake_image), } + assert batch_input.keys() == expected_input.keys() for k in batch_input.keys(): assert batch_input[k].eq(torch.tensor(expected_input[k])).all() @@ -150,3 +163,7 @@ def test_4d_attention_mask(): ) assert list(attention_mask_computed.size()) == [2, 1, 6, 6] assert torch.all(attention_mask_computed == attention_mask_expected) + + +if __name__ == "__main__": + test_multimodal_collator() diff --git a/Model/LLaMA-Factory/tests/version.txt b/Model/LLaMA-Factory/tests/version.txt index dae5ebba38198ce6cc0907d648d3065ffbdd9de4..0f1383aa404d913c8913d3d639459b227b45df2c 100644 --- a/Model/LLaMA-Factory/tests/version.txt +++ b/Model/LLaMA-Factory/tests/version.txt @@ -1,2 +1,2 @@ # change if test fails or cache is outdated -0.9.3.107 +0.9.3.108 diff --git a/Model/LLaMA-Factory/wandb/debug-internal.log 
b/Model/LLaMA-Factory/wandb/debug-internal.log index 19504c32253e59065e61a760f701a8aff06f85c2..d68dc2b105686882a2e9846d36d51b3aa2b97dfc 100644 --- a/Model/LLaMA-Factory/wandb/debug-internal.log +++ b/Model/LLaMA-Factory/wandb/debug-internal.log @@ -1,33 +1,7 @@ -{"time":"2025-06-11T01:51:58.116391554Z","level":"INFO","msg":"stream: starting","core version":"0.19.9","symlink path":"/kaggle/working/LLaMA-Factory/wandb/run-20250611_015158-yi8lyobb/logs/debug-core.log"} -{"time":"2025-06-11T01:51:58.361852515Z","level":"INFO","msg":"created new stream","id":"yi8lyobb"} -{"time":"2025-06-11T01:51:58.361901495Z","level":"INFO","msg":"stream: started","id":"yi8lyobb"} -{"time":"2025-06-11T01:51:58.3619241Z","level":"INFO","msg":"writer: Do: started","stream_id":"yi8lyobb"} -{"time":"2025-06-11T01:51:58.361953859Z","level":"INFO","msg":"handler: started","stream_id":"yi8lyobb"} -{"time":"2025-06-11T01:51:58.362006812Z","level":"INFO","msg":"sender: started","stream_id":"yi8lyobb"} -{"time":"2025-06-11T01:51:58.660266243Z","level":"INFO","msg":"Starting system monitor"} -{"time":"2025-06-11T01:52:28.661253613Z","level":"ERROR","msg":"monitor: error sampling metrics: open /proc/382/statm: no such file or directory\nopen /proc/382/stat: no such file or directory\nopen /proc/382/status: no such file or directory"} -{"time":"2025-06-11T01:52:43.662275122Z","level":"ERROR","msg":"monitor: error sampling metrics: open /proc/382/statm: no such file or directory\nopen /proc/382/stat: no such file or directory\nopen /proc/382/status: no such file or directory"} -{"time":"2025-06-11T01:52:58.661310276Z","level":"ERROR","msg":"monitor: error sampling metrics: open /proc/382/statm: no such file or directory\nopen /proc/382/stat: no such file or directory\nopen /proc/382/status: no such file or directory"} -{"time":"2025-06-11T01:53:13.661956043Z","level":"ERROR","msg":"monitor: error sampling metrics: open /proc/382/statm: no such file or directory\nopen /proc/382/stat: no such file or 
directory\nopen /proc/382/status: no such file or directory"} -{"time":"2025-06-11T01:53:28.661923699Z","level":"ERROR","msg":"monitor: error sampling metrics: open /proc/382/statm: no such file or directory\nopen /proc/382/stat: no such file or directory\nopen /proc/382/status: no such file or directory"} -{"time":"2025-06-11T01:53:43.661740603Z","level":"ERROR","msg":"monitor: error sampling metrics: open /proc/382/statm: no such file or directory\nopen /proc/382/stat: no such file or directory\nopen /proc/382/status: no such file or directory"} -{"time":"2025-06-11T01:53:58.66160437Z","level":"ERROR","msg":"monitor: error sampling metrics: open /proc/382/statm: no such file or directory\nopen /proc/382/stat: no such file or directory\nopen /proc/382/status: no such file or directory"} -{"time":"2025-06-11T01:54:13.662232736Z","level":"ERROR","msg":"monitor: error sampling metrics: open /proc/382/statm: no such file or directory\nopen /proc/382/stat: no such file or directory\nopen /proc/382/status: no such file or directory"} -{"time":"2025-06-11T01:54:28.661528482Z","level":"ERROR","msg":"monitor: error sampling metrics: open /proc/382/statm: no such file or directory\nopen /proc/382/stat: no such file or directory\nopen /proc/382/status: no such file or directory"} -{"time":"2025-06-11T01:54:43.661369168Z","level":"ERROR","msg":"monitor: error sampling metrics: open /proc/382/statm: no such file or directory\nopen /proc/382/stat: no such file or directory\nopen /proc/382/status: no such file or directory"} -{"time":"2025-06-11T01:54:58.661606101Z","level":"ERROR","msg":"monitor: error sampling metrics: open /proc/382/statm: no such file or directory\nopen /proc/382/stat: no such file or directory\nopen /proc/382/status: no such file or directory"} -{"time":"2025-06-11T01:55:13.662309602Z","level":"ERROR","msg":"monitor: error sampling metrics: open /proc/382/statm: no such file or directory\nopen /proc/382/stat: no such file or directory\nopen 
/proc/382/status: no such file or directory"} -{"time":"2025-06-11T01:55:28.662482967Z","level":"ERROR","msg":"monitor: error sampling metrics: open /proc/382/statm: no such file or directory\nopen /proc/382/stat: no such file or directory\nopen /proc/382/status: no such file or directory"} -{"time":"2025-06-11T01:55:43.662100903Z","level":"ERROR","msg":"monitor: error sampling metrics: open /proc/382/statm: no such file or directory\nopen /proc/382/stat: no such file or directory\nopen /proc/382/status: no such file or directory"} -{"time":"2025-06-11T01:55:58.662338968Z","level":"ERROR","msg":"monitor: error sampling metrics: open /proc/382/statm: no such file or directory\nopen /proc/382/stat: no such file or directory\nopen /proc/382/status: no such file or directory"} -{"time":"2025-06-11T01:56:13.661495822Z","level":"ERROR","msg":"monitor: error sampling metrics: open /proc/382/statm: no such file or directory\nopen /proc/382/stat: no such file or directory\nopen /proc/382/status: no such file or directory"} -{"time":"2025-06-11T01:56:28.662316446Z","level":"ERROR","msg":"monitor: error sampling metrics: open /proc/382/statm: no such file or directory\nopen /proc/382/stat: no such file or directory\nopen /proc/382/status: no such file or directory"} -{"time":"2025-06-11T01:56:43.662194959Z","level":"ERROR","msg":"monitor: error sampling metrics: open /proc/382/statm: no such file or directory\nopen /proc/382/stat: no such file or directory\nopen /proc/382/status: no such file or directory"} -{"time":"2025-06-11T01:56:58.662150218Z","level":"ERROR","msg":"monitor: error sampling metrics: open /proc/382/statm: no such file or directory\nopen /proc/382/stat: no such file or directory\nopen /proc/382/status: no such file or directory"} -{"time":"2025-06-11T01:57:13.661599272Z","level":"ERROR","msg":"monitor: error sampling metrics: open /proc/382/statm: no such file or directory\nopen /proc/382/stat: no such file or directory\nopen /proc/382/status: no such file 
or directory"} -{"time":"2025-06-11T01:57:28.661431364Z","level":"ERROR","msg":"monitor: error sampling metrics: open /proc/382/statm: no such file or directory\nopen /proc/382/stat: no such file or directory\nopen /proc/382/status: no such file or directory"} -{"time":"2025-06-11T01:57:43.662302067Z","level":"ERROR","msg":"monitor: error sampling metrics: open /proc/382/statm: no such file or directory\nopen /proc/382/stat: no such file or directory\nopen /proc/382/status: no such file or directory"} -{"time":"2025-06-11T01:57:58.661441336Z","level":"ERROR","msg":"monitor: error sampling metrics: open /proc/382/statm: no such file or directory\nopen /proc/382/stat: no such file or directory\nopen /proc/382/status: no such file or directory"} -{"time":"2025-06-11T01:58:13.661575923Z","level":"ERROR","msg":"monitor: error sampling metrics: open /proc/382/statm: no such file or directory\nopen /proc/382/stat: no such file or directory\nopen /proc/382/status: no such file or directory"} -{"time":"2025-06-11T01:58:28.662143059Z","level":"ERROR","msg":"monitor: error sampling metrics: open /proc/382/statm: no such file or directory\nopen /proc/382/stat: no such file or directory\nopen /proc/382/status: no such file or directory"} -{"time":"2025-06-11T01:58:43.661733399Z","level":"ERROR","msg":"monitor: error sampling metrics: open /proc/382/statm: no such file or directory\nopen /proc/382/stat: no such file or directory\nopen /proc/382/status: no such file or directory"} +{"time":"2025-06-18T02:04:45.169771821Z","level":"INFO","msg":"stream: starting","core version":"0.19.9","symlink path":"/kaggle/working/LLaMA-Factory/wandb/run-20250618_020445-o5waoqcx/logs/debug-core.log"} +{"time":"2025-06-18T02:04:45.427972883Z","level":"INFO","msg":"created new stream","id":"o5waoqcx"} +{"time":"2025-06-18T02:04:45.428081239Z","level":"INFO","msg":"stream: started","id":"o5waoqcx"} +{"time":"2025-06-18T02:04:45.428182138Z","level":"INFO","msg":"sender: 
started","stream_id":"o5waoqcx"} +{"time":"2025-06-18T02:04:45.428270431Z","level":"INFO","msg":"handler: started","stream_id":"o5waoqcx"} +{"time":"2025-06-18T02:04:45.428273846Z","level":"INFO","msg":"writer: Do: started","stream_id":"o5waoqcx"} +{"time":"2025-06-18T02:04:45.721935563Z","level":"INFO","msg":"Starting system monitor"} diff --git a/Model/LLaMA-Factory/wandb/debug.log b/Model/LLaMA-Factory/wandb/debug.log index 5a37ee327ab36756c00d899acf08d14dfffeac21..afdb722aae1e8a6304b2572da6e3ed067d2818bd 100644 --- a/Model/LLaMA-Factory/wandb/debug.log +++ b/Model/LLaMA-Factory/wandb/debug.log @@ -1,26 +1,26 @@ -2025-06-11 01:51:58,101 INFO MainThread:382 [wandb_setup.py:_flush():67] Current SDK version is 0.19.9 -2025-06-11 01:51:58,101 INFO MainThread:382 [wandb_setup.py:_flush():67] Configure stats pid to 382 -2025-06-11 01:51:58,101 INFO MainThread:382 [wandb_setup.py:_flush():67] Loading settings from /root/.config/wandb/settings -2025-06-11 01:51:58,101 INFO MainThread:382 [wandb_setup.py:_flush():67] Loading settings from /kaggle/working/LLaMA-Factory/wandb/settings -2025-06-11 01:51:58,101 INFO MainThread:382 [wandb_setup.py:_flush():67] Loading settings from environment variables -2025-06-11 01:51:58,101 INFO MainThread:382 [wandb_init.py:setup_run_log_directory():662] Logging user logs to /kaggle/working/LLaMA-Factory/wandb/run-20250611_015158-yi8lyobb/logs/debug.log -2025-06-11 01:51:58,101 INFO MainThread:382 [wandb_init.py:setup_run_log_directory():663] Logging internal logs to /kaggle/working/LLaMA-Factory/wandb/run-20250611_015158-yi8lyobb/logs/debug-internal.log -2025-06-11 01:51:58,101 INFO MainThread:382 [wandb_init.py:init():781] calling init triggers -2025-06-11 01:51:58,101 INFO MainThread:382 [wandb_init.py:init():786] wandb.init called with sweep_config: {} +2025-06-18 02:04:45,149 INFO MainThread:294 [wandb_setup.py:_flush():67] Current SDK version is 0.19.9 +2025-06-18 02:04:45,150 INFO MainThread:294 [wandb_setup.py:_flush():67] 
Configure stats pid to 294 +2025-06-18 02:04:45,150 INFO MainThread:294 [wandb_setup.py:_flush():67] Loading settings from /root/.config/wandb/settings +2025-06-18 02:04:45,150 INFO MainThread:294 [wandb_setup.py:_flush():67] Loading settings from /kaggle/working/LLaMA-Factory/wandb/settings +2025-06-18 02:04:45,150 INFO MainThread:294 [wandb_setup.py:_flush():67] Loading settings from environment variables +2025-06-18 02:04:45,150 INFO MainThread:294 [wandb_init.py:setup_run_log_directory():662] Logging user logs to /kaggle/working/LLaMA-Factory/wandb/run-20250618_020445-o5waoqcx/logs/debug.log +2025-06-18 02:04:45,150 INFO MainThread:294 [wandb_init.py:setup_run_log_directory():663] Logging internal logs to /kaggle/working/LLaMA-Factory/wandb/run-20250618_020445-o5waoqcx/logs/debug-internal.log +2025-06-18 02:04:45,150 INFO MainThread:294 [wandb_init.py:init():781] calling init triggers +2025-06-18 02:04:45,150 INFO MainThread:294 [wandb_init.py:init():786] wandb.init called with sweep_config: {} config: {'_wandb': {}} -2025-06-11 01:51:58,101 INFO MainThread:382 [wandb_init.py:init():809] starting backend -2025-06-11 01:51:58,101 INFO MainThread:382 [wandb_init.py:init():813] sending inform_init request -2025-06-11 01:51:58,112 INFO MainThread:382 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn -2025-06-11 01:51:58,112 INFO MainThread:382 [wandb_init.py:init():823] backend started and connected -2025-06-11 01:51:58,117 INFO MainThread:382 [wandb_init.py:init():915] updated telemetry -2025-06-11 01:51:58,134 INFO MainThread:382 [wandb_init.py:init():939] communicating run to backend with 90.0 second timeout -2025-06-11 01:51:58,652 INFO MainThread:382 [wandb_init.py:init():1014] starting run threads in backend -2025-06-11 01:51:59,267 INFO MainThread:382 [wandb_run.py:_console_start():2454] atexit reg -2025-06-11 01:51:59,267 INFO MainThread:382 [wandb_run.py:_redirect():2306] redirect: wrap_raw 
-2025-06-11 01:51:59,267 INFO MainThread:382 [wandb_run.py:_redirect():2371] Wrapping output streams. -2025-06-11 01:51:59,267 INFO MainThread:382 [wandb_run.py:_redirect():2394] Redirects installed. -2025-06-11 01:51:59,278 INFO MainThread:382 [wandb_init.py:init():1056] run started, returning control to user process -2025-06-11 01:51:59,280 INFO MainThread:382 [wandb_run.py:_config_callback():1327] config_cb None None {'peft_config': {'default': {'task_type': , 'peft_type': , 'auto_mapping': None, 'base_model_name_or_path': 'Qwen/Qwen3-0.6B', 'revision': None, 'inference_mode': False, 'r': 64, 'target_modules': {'v_proj', 'q_proj', 'gate_proj', 'o_proj', 'k_proj', 'up_proj', 'down_proj'}, 'exclude_modules': None, 'lora_alpha': 128, 'lora_dropout': 0.0, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'loftq_config': {}, 'eva_config': None, 'use_dora': False, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False}}, 'vocab_size': 151936, 'max_position_embeddings': 40960, 'hidden_size': 1024, 'intermediate_size': 3072, 'num_hidden_layers': 28, 'num_attention_heads': 16, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 28, 'num_key_value_heads': 8, 'head_dim': 128, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'rope_theta': 1000000, 'rope_scaling': None, 'attention_bias': False, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 
'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['Qwen3ForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 151643, 'pad_token_id': None, 'eos_token_id': 151645, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'Qwen/Qwen3-0.6B', '_attn_implementation_autoset': True, 'transformers_version': '4.51.3', 'model_type': 'qwen3', 'output_dir': '/content/drive/MyDrive/youtube-resources', 'overwrite_output_dir': True, 'do_train': True, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'epoch', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 4, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 0.0001, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3.0, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.1, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': 
'/content/drive/MyDrive/youtube-resources/runs/Jun11_01-50-33_9f49020d3272', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 50, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 500, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': 'Qwennn', 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': ['labels'], 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'tp_size': 0, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': False, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': 
False, 'hub_model_id': None, 'hub_strategy': 'checkpoint', 'hub_token': '', 'hub_private_repo': True, 'hub_always_push': False, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 180000000, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False, 'average_tokens_across_devices': False, 'sortish_sampler': False, 'predict_with_generate': False, 'generation_max_length': 4096, 'generation_num_beams': None, 'generation_config': None, 'ray_run_name': None, 'ray_storage_path': './saves', 'ray_storage_filesystem': None, 'ray_num_workers': 1, 'resources_per_worker': {'GPU': 1}, 'placement_strategy': 'PACK', 'ray_init_kwargs': None} -2025-06-11 01:51:59,291 INFO MainThread:382 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 636420096 - > -2025-06-11 01:51:59,291 INFO MainThread:382 [wandb_run.py:_config_callback():1327] config_cb model/num_parameters 636420096 None -2025-06-11 01:51:59,292 INFO MainThread:382 [wandb_run.py:_config_callback():1327] config_cb None None {'model_args': {'model_name_or_path': 'Qwen/Qwen3-0.6B', 'adapter_name_or_path': None, 'adapter_folder': None, 'cache_dir': None, 'use_fast_tokenizer': True, 'resize_vocab': False, 'split_special_tokens': False, 'add_tokens': None, 'add_special_tokens': None, 'model_revision': 'main', 'low_cpu_mem_usage': True, 'rope_scaling': None, 'flash_attn': 
'AUTO', 'shift_attn': False, 'mixture_of_depths': None, 'use_unsloth': False, 'use_unsloth_gc': False, 'enable_liger_kernel': False, 'moe_aux_loss_coef': None, 'disable_gradient_checkpointing': False, 'use_reentrant_gc': True, 'upcast_layernorm': False, 'upcast_lmhead_output': False, 'train_from_scratch': False, 'infer_backend': 'HF', 'offload_folder': 'offload', 'use_cache': True, 'infer_dtype': 'auto', 'hf_hub_token': '', 'ms_hub_token': '', 'om_hub_token': '', 'print_param_status': False, 'trust_remote_code': True, 'quantization_method': 'BNB', 'quantization_bit': None, 'quantization_type': 'nf4', 'double_quantization': True, 'quantization_device_map': None, 'image_max_pixels': 589824, 'image_min_pixels': 1024, 'image_do_pan_and_scan': False, 'crop_to_patches': False, 'video_max_pixels': 65536, 'video_min_pixels': 256, 'video_fps': 2.0, 'video_maxlen': 128, 'use_audio_in_video': False, 'audio_sampling_rate': 16000, 'export_dir': None, 'export_size': 5, 'export_device': 'cpu', 'export_quantization_bit': None, 'export_quantization_dataset': None, 'export_quantization_nsamples': 128, 'export_quantization_maxlen': 1024, 'export_legacy_format': False, 'export_hub_model_id': 'Youssef/QWEN_Arabic_Q&A', 'vllm_maxlen': 4096, 'vllm_gpu_util': 0.7, 'vllm_enforce_eager': False, 'vllm_max_lora_rank': 32, 'vllm_config': None, 'sglang_maxlen': 4096, 'sglang_mem_fraction': 0.7, 'sglang_tp_size': -1, 'sglang_config': None, 'sglang_lora_backend': 'triton', 'compute_dtype': 'torch.bfloat16', 'device_map': {'': 'cuda:0'}, 'model_max_length': 4096, 'block_diag_attn': False}, 'data_args': {'template': 'qwen', 'dataset': ['QAtrain'], 'eval_dataset': ['QAval'], 'dataset_dir': 'data', 'media_dir': 'data', 'cutoff_len': 4096, 'train_on_prompt': False, 'mask_history': False, 'streaming': False, 'buffer_size': 16384, 'mix_strategy': 'concat', 'interleave_probs': None, 'overwrite_cache': True, 'preprocessing_batch_size': 1000, 'preprocessing_num_workers': 32, 'max_samples': None, 
'eval_num_beams': None, 'ignore_pad_token_for_loss': True, 'val_size': 0.0, 'eval_on_each_dataset': False, 'packing': False, 'neat_packing': False, 'tool_format': None, 'default_system': None, 'enable_thinking': True, 'tokenized_path': None, 'data_shared_file_system': False}, 'finetuning_args': {'freeze_trainable_layers': 2, 'freeze_trainable_modules': ['all'], 'freeze_extra_modules': None, 'additional_target': None, 'lora_alpha': 128, 'lora_dropout': 0.0, 'lora_rank': 64, 'lora_target': ['all'], 'loraplus_lr_ratio': None, 'loraplus_lr_embedding': 1e-06, 'use_rslora': False, 'use_dora': False, 'pissa_init': False, 'pissa_iter': 16, 'pissa_convert': False, 'create_new_adapter': False, 'pref_beta': 0.1, 'pref_ftx': 0.0, 'pref_loss': 'sigmoid', 'dpo_label_smoothing': 0.0, 'kto_chosen_weight': 1.0, 'kto_rejected_weight': 1.0, 'simpo_gamma': 0.5, 'ppo_buffer_size': 1, 'ppo_epochs': 4, 'ppo_score_norm': False, 'ppo_target': 6.0, 'ppo_whiten_rewards': False, 'ref_model': None, 'ref_model_adapters': None, 'ref_model_quantization_bit': None, 'reward_model': None, 'reward_model_adapters': None, 'reward_model_quantization_bit': None, 'reward_model_type': 'lora', 'use_galore': False, 'galore_target': ['all'], 'galore_rank': 16, 'galore_update_interval': 200, 'galore_scale': 2.0, 'galore_proj_type': 'std', 'galore_layerwise': False, 'use_apollo': False, 'apollo_target': ['all'], 'apollo_rank': 16, 'apollo_update_interval': 200, 'apollo_scale': 32.0, 'apollo_proj': 'random', 'apollo_proj_type': 'std', 'apollo_scale_type': 'channel', 'apollo_layerwise': False, 'apollo_scale_front': False, 'use_badam': False, 'badam_mode': 'layer', 'badam_start_block': None, 'badam_switch_mode': 'ascending', 'badam_switch_interval': 50, 'badam_update_ratio': 0.05, 'badam_mask_mode': 'adjacent', 'badam_verbose': 0, 'use_swanlab': False, 'swanlab_project': 'llamafactory', 'swanlab_workspace': None, 'swanlab_run_name': None, 'swanlab_mode': 'cloud', 'swanlab_api_key': '', 'swanlab_logdir': None, 
'swanlab_lark_webhook_url': None, 'swanlab_lark_secret': None, 'pure_bf16': False, 'stage': 'sft', 'finetuning_type': 'lora', 'use_llama_pro': False, 'use_adam_mini': False, 'use_muon': False, 'freeze_vision_tower': True, 'freeze_multi_modal_projector': True, 'freeze_language_model': False, 'compute_accuracy': False, 'disable_shuffling': False, 'early_stopping_steps': None, 'plot_loss': True, 'include_effective_tokens_per_second': False}, 'generating_args': {'do_sample': True, 'temperature': 0.95, 'top_p': 0.7, 'top_k': 50, 'num_beams': 1, 'max_new_tokens': 1024, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'skip_special_tokens': True}} +2025-06-18 02:04:45,150 INFO MainThread:294 [wandb_init.py:init():809] starting backend +2025-06-18 02:04:45,150 INFO MainThread:294 [wandb_init.py:init():813] sending inform_init request +2025-06-18 02:04:45,163 INFO MainThread:294 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-06-18 02:04:45,163 INFO MainThread:294 [wandb_init.py:init():823] backend started and connected +2025-06-18 02:04:45,172 INFO MainThread:294 [wandb_init.py:init():915] updated telemetry +2025-06-18 02:04:45,188 INFO MainThread:294 [wandb_init.py:init():939] communicating run to backend with 90.0 second timeout +2025-06-18 02:04:45,709 INFO MainThread:294 [wandb_init.py:init():1014] starting run threads in backend +2025-06-18 02:04:46,481 INFO MainThread:294 [wandb_run.py:_console_start():2454] atexit reg +2025-06-18 02:04:46,482 INFO MainThread:294 [wandb_run.py:_redirect():2306] redirect: wrap_raw +2025-06-18 02:04:46,482 INFO MainThread:294 [wandb_run.py:_redirect():2371] Wrapping output streams. +2025-06-18 02:04:46,482 INFO MainThread:294 [wandb_run.py:_redirect():2394] Redirects installed. 
+2025-06-18 02:04:46,499 INFO MainThread:294 [wandb_init.py:init():1056] run started, returning control to user process +2025-06-18 02:04:46,503 INFO MainThread:294 [wandb_run.py:_config_callback():1327] config_cb None None {'peft_config': {'default': {'task_type': , 'peft_type': , 'auto_mapping': None, 'base_model_name_or_path': 'Qwen/Qwen2.5-1.5B', 'revision': None, 'inference_mode': False, 'r': 64, 'target_modules': {'q_proj', 'up_proj', 'down_proj', 'k_proj', 'gate_proj', 'v_proj', 'o_proj'}, 'exclude_modules': None, 'lora_alpha': 128, 'lora_dropout': 0.0, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'loftq_config': {}, 'eva_config': None, 'use_dora': False, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False}}, 'vocab_size': 151936, 'max_position_embeddings': 131072, 'hidden_size': 1536, 'intermediate_size': 8960, 'num_hidden_layers': 28, 'num_attention_heads': 12, 'use_sliding_window': False, 'sliding_window': 131072, 'max_window_layers': 28, 'num_key_value_heads': 2, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'rope_theta': 1000000.0, 'rope_scaling': None, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 
50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['Qwen2ForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 151643, 'pad_token_id': None, 'eos_token_id': 151643, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'Qwen/Qwen2.5-1.5B', '_attn_implementation_autoset': True, 'transformers_version': '4.51.3', 'model_type': 'qwen2', 'use_mrope': False, 'output_dir': '/kaggle/working/', 'overwrite_output_dir': True, 'do_train': True, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 16, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 0.0001, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3.0, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.1, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': '/kaggle/working/runs/Jun18_02-03-42_79b2ce5216f6', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 50, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 200, 'save_total_limit': None, 
'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 200, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': 'Qwennn', 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': ['labels'], 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'tp_size': 0, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': False, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': '/kaggle/working/Model/last-checkpoint', 'hub_model_id': None, 'hub_strategy': 'checkpoint', 'hub_token': '', 'hub_private_repo': True, 'hub_always_push': False, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': 
False, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 180000000, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False, 'average_tokens_across_devices': False, 'sortish_sampler': False, 'predict_with_generate': False, 'generation_max_length': 2048, 'generation_num_beams': None, 'generation_config': None, 'ray_run_name': None, 'ray_storage_path': './saves', 'ray_storage_filesystem': None, 'ray_num_workers': 1, 'resources_per_worker': {'GPU': 1}, 'placement_strategy': 'PACK', 'ray_init_kwargs': None} +2025-06-18 02:04:46,536 INFO MainThread:294 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 1617573376 - > +2025-06-18 02:04:46,536 INFO MainThread:294 [wandb_run.py:_config_callback():1327] config_cb model/num_parameters 1617573376 None +2025-06-18 02:04:46,542 INFO MainThread:294 [wandb_run.py:_config_callback():1327] config_cb None None {'model_args': {'model_name_or_path': 'Qwen/Qwen2.5-1.5B', 'adapter_name_or_path': None, 'adapter_folder': None, 'cache_dir': None, 'use_fast_tokenizer': True, 'resize_vocab': False, 'split_special_tokens': False, 'add_tokens': None, 'add_special_tokens': None, 'model_revision': 'main', 'low_cpu_mem_usage': True, 'rope_scaling': None, 'flash_attn': 'AUTO', 'shift_attn': False, 'mixture_of_depths': None, 'use_unsloth': False, 'use_unsloth_gc': False, 'enable_liger_kernel': False, 'moe_aux_loss_coef': None, 'disable_gradient_checkpointing': False, 'use_reentrant_gc': True, 
'upcast_layernorm': False, 'upcast_lmhead_output': False, 'train_from_scratch': False, 'infer_backend': 'HF', 'offload_folder': 'offload', 'use_cache': True, 'infer_dtype': 'auto', 'hf_hub_token': '', 'ms_hub_token': '', 'om_hub_token': '', 'print_param_status': False, 'trust_remote_code': True, 'quantization_method': 'BNB', 'quantization_bit': None, 'quantization_type': 'nf4', 'double_quantization': True, 'quantization_device_map': None, 'image_max_pixels': 589824, 'image_min_pixels': 1024, 'image_do_pan_and_scan': False, 'crop_to_patches': False, 'video_max_pixels': 65536, 'video_min_pixels': 256, 'video_fps': 2.0, 'video_maxlen': 128, 'use_audio_in_video': False, 'audio_sampling_rate': 16000, 'export_dir': None, 'export_size': 5, 'export_device': 'cpu', 'export_quantization_bit': None, 'export_quantization_dataset': None, 'export_quantization_nsamples': 128, 'export_quantization_maxlen': 1024, 'export_legacy_format': False, 'export_hub_model_id': 'Youssef/QWEN_Arabic_Q&A', 'vllm_maxlen': 4096, 'vllm_gpu_util': 0.7, 'vllm_enforce_eager': False, 'vllm_max_lora_rank': 32, 'vllm_config': None, 'sglang_maxlen': 4096, 'sglang_mem_fraction': 0.7, 'sglang_tp_size': -1, 'sglang_config': None, 'sglang_lora_backend': 'triton', 'compute_dtype': 'torch.bfloat16', 'device_map': {'': 'cuda:0'}, 'model_max_length': 2048, 'block_diag_attn': False}, 'data_args': {'template': 'qwen', 'dataset': ['QAtrain'], 'eval_dataset': ['QAval'], 'dataset_dir': 'data', 'media_dir': 'data', 'cutoff_len': 2048, 'train_on_prompt': False, 'mask_history': False, 'streaming': False, 'buffer_size': 16384, 'mix_strategy': 'concat', 'interleave_probs': None, 'overwrite_cache': True, 'preprocessing_batch_size': 1000, 'preprocessing_num_workers': 8, 'max_samples': None, 'eval_num_beams': None, 'ignore_pad_token_for_loss': True, 'val_size': 0.0, 'eval_on_each_dataset': False, 'packing': False, 'neat_packing': False, 'tool_format': None, 'default_system': None, 'enable_thinking': True, 'tokenized_path': 
None, 'data_shared_file_system': False}, 'finetuning_args': {'freeze_trainable_layers': 2, 'freeze_trainable_modules': ['all'], 'freeze_extra_modules': None, 'additional_target': None, 'lora_alpha': 128, 'lora_dropout': 0.0, 'lora_rank': 64, 'lora_target': ['all'], 'loraplus_lr_ratio': None, 'loraplus_lr_embedding': 1e-06, 'use_rslora': False, 'use_dora': False, 'pissa_init': False, 'pissa_iter': 16, 'pissa_convert': False, 'create_new_adapter': False, 'pref_beta': 0.1, 'pref_ftx': 0.0, 'pref_loss': 'sigmoid', 'dpo_label_smoothing': 0.0, 'kto_chosen_weight': 1.0, 'kto_rejected_weight': 1.0, 'simpo_gamma': 0.5, 'ppo_buffer_size': 1, 'ppo_epochs': 4, 'ppo_score_norm': False, 'ppo_target': 6.0, 'ppo_whiten_rewards': False, 'ref_model': None, 'ref_model_adapters': None, 'ref_model_quantization_bit': None, 'reward_model': None, 'reward_model_adapters': None, 'reward_model_quantization_bit': None, 'reward_model_type': 'lora', 'ld_alpha': None, 'use_galore': False, 'galore_target': ['all'], 'galore_rank': 16, 'galore_update_interval': 200, 'galore_scale': 2.0, 'galore_proj_type': 'std', 'galore_layerwise': False, 'use_apollo': False, 'apollo_target': ['all'], 'apollo_rank': 16, 'apollo_update_interval': 200, 'apollo_scale': 32.0, 'apollo_proj': 'random', 'apollo_proj_type': 'std', 'apollo_scale_type': 'channel', 'apollo_layerwise': False, 'apollo_scale_front': False, 'use_badam': False, 'badam_mode': 'layer', 'badam_start_block': None, 'badam_switch_mode': 'ascending', 'badam_switch_interval': 50, 'badam_update_ratio': 0.05, 'badam_mask_mode': 'adjacent', 'badam_verbose': 0, 'use_swanlab': False, 'swanlab_project': 'llamafactory', 'swanlab_workspace': None, 'swanlab_run_name': None, 'swanlab_mode': 'cloud', 'swanlab_api_key': '', 'swanlab_logdir': None, 'swanlab_lark_webhook_url': None, 'swanlab_lark_secret': None, 'pure_bf16': False, 'stage': 'sft', 'finetuning_type': 'lora', 'use_llama_pro': False, 'use_adam_mini': False, 'use_muon': False, 'freeze_vision_tower': True, 
'freeze_multi_modal_projector': True, 'freeze_language_model': False, 'compute_accuracy': False, 'disable_shuffling': False, 'early_stopping_steps': None, 'plot_loss': True, 'include_effective_tokens_per_second': False}, 'generating_args': {'do_sample': True, 'temperature': 0.95, 'top_p': 0.7, 'top_k': 50, 'num_beams': 1, 'max_new_tokens': 1024, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'skip_special_tokens': True}} diff --git a/Model/LLaMA-Factory/wandb/run-20250618_020445-o5waoqcx/files/output.log b/Model/LLaMA-Factory/wandb/run-20250618_020445-o5waoqcx/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..226cc11a542c94dfbb6dfa8ec9447a8318eee520 --- /dev/null +++ b/Model/LLaMA-Factory/wandb/run-20250618_020445-o5waoqcx/files/output.log @@ -0,0 +1,52 @@ + 39%|█████████████▎ | 1000/2547 [8:45:44<29:09:43, 67.86s/it][INFO|trainer.py:4307] 2025-06-18 10:50:30,573 >> +{'loss': 0.6122, 'grad_norm': 0.6072946190834045, 'learning_rate': 9.599483839268026e-05, 'epoch': 0.65} +{'loss': 0.5809, 'grad_norm': 0.6030572652816772, 'learning_rate': 9.454410179022932e-05, 'epoch': 0.71} +{'loss': 0.5446, 'grad_norm': 0.5781008005142212, 'learning_rate': 9.288422825194501e-05, 'epoch': 0.77} +{'loss': 0.5339, 'grad_norm': 0.5412103533744812, 'learning_rate': 9.102301097269974e-05, 'epoch': 0.82} +{'loss': 0.5296, 'grad_norm': 0.5678456425666809, 'learning_rate': 8.896918846697821e-05, 'epoch': 0.88} +{'loss': 0.5176, 'grad_norm': 0.525556206703186, 'learning_rate': 8.673240354108538e-05, 'epoch': 0.94} +{'loss': 0.5104, 'grad_norm': 1.9685856103897095, 'learning_rate': 8.432315801965616e-05, 'epoch': 1.0} +{'loss': 0.4685, 'grad_norm': 0.6006094217300415, 'learning_rate': 8.175276343902802e-05, 'epoch': 1.06} +{'loss': 0.473, 'grad_norm': 0.5228903889656067, 'learning_rate': 7.903328793897418e-05, 'epoch': 1.12} +{'loss': 0.4679, 'grad_norm': 0.5006899237632751, 'learning_rate': 7.6177499602143e-05, 'epoch': 1.18} +***** Running Evaluation 
***** +[INFO|trainer.py:4309] 2025-06-18 10:50:30,573 >> Num examples = 3020 +[INFO|trainer.py:4312] 2025-06-18 10:50:30,573 >> Batch size = 1 + 39%|█████████████▎ | 1000/2547 [9:21:56<29:09:43, 67.86s/it][INFO|trainer.py:3984] 2025-06-18 11:26:43,032 >> Saving model checkpoint to /kaggle/working/checkpoint-1000 +[INFO|configuration_utils.py:693] 2025-06-18 11:26:43,310 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen2.5-1.5B/snapshots/8faed761d45a263340a0528343f099c05c9a4323/config.json +{'eval_loss': 0.4844963848590851, 'eval_runtime': 2172.4438, 'eval_samples_per_second': 1.39, 'eval_steps_per_second': 0.695, 'epoch': 1.18} +[INFO|configuration_utils.py:765] 2025-06-18 11:26:43,313 >> Model config Qwen2Config { + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151643, + "hidden_act": "silu", + "hidden_size": 1536, + "initializer_range": 0.02, + "intermediate_size": 8960, + "max_position_embeddings": 131072, + "max_window_layers": 28, + "model_type": "qwen2", + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": 131072, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.51.3", + "use_cache": true, + "use_mrope": false, + "use_sliding_window": false, + "vocab_size": 151936 +} + +[INFO|tokenization_utils_base.py:2510] 2025-06-18 11:26:44,023 >> tokenizer config file saved in /kaggle/working/checkpoint-1000/tokenizer_config.json +[INFO|tokenization_utils_base.py:2519] 2025-06-18 11:26:44,024 >> Special tokens file saved in /kaggle/working/checkpoint-1000/special_tokens_map.json +[INFO|tokenization_utils_base.py:2510] 2025-06-18 11:26:45,701 >> tokenizer config file saved in /kaggle/working/tokenizer_config.json +[INFO|tokenization_utils_base.py:2519] 2025-06-18 11:26:45,702 >> 
Special tokens file saved in /kaggle/working/special_tokens_map.json +It seems you are trying to upload a large folder at once. This might take some time and then fail if the folder is too large. For such cases, it is recommended to upload in smaller batches or to use `HfApi().upload_large_folder(...)`/`huggingface-cli upload-large-folder` instead. For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/upload#upload-a-large-folder. diff --git a/Model/LLaMA-Factory/wandb/run-20250618_020445-o5waoqcx/files/requirements.txt b/Model/LLaMA-Factory/wandb/run-20250618_020445-o5waoqcx/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..3b3b0633e48d811eb2563745df3eedf46d2965fa --- /dev/null +++ b/Model/LLaMA-Factory/wandb/run-20250618_020445-o5waoqcx/files/requirements.txt @@ -0,0 +1,897 @@ +psutil==7.0.0 +setproctitle==1.2.2 +colorama==0.4.6 +nvidia-cusparse-cu12==12.3.1.170 +nvidia-cusolver-cu12==11.6.1.9 +nvidia-curand-cu12==10.3.5.147 +tomlkit==0.13.3 +sse-starlette==2.3.6 +pydantic==2.10.6 +gradio_client==1.10.1 +tyro==0.8.14 +fastapi==0.115.13 +starlette==0.46.2 +fsspec==2025.3.0 +pydantic_core==2.27.2 +fire==0.7.0 +nvidia-cudnn-cu12==9.1.0.70 +av==14.4.0 +gradio==5.31.0 +semantic-version==2.10.0 +python-multipart==0.0.20 +groovy==0.1.2 +shtab==1.7.2 +llamafactory==0.9.4.dev0 +uvicorn==0.34.3 +ffmpy==0.6.0 +safehttpx==0.1.6 +nvidia-cublas-cu12==12.4.5.8 +ruff==0.12.0 +nvidia-nvjitlink-cu12==12.4.127 +trl==0.9.6 +nvidia-cufft-cu12==11.2.1.3 +google-cloud-bigquery==3.25.0 +bq_helper==0.4.1 +joblib==1.5.0 +nltk==3.9.1 +regex==2024.11.6 +click==8.1.8 +tqdm==4.67.1 +lightgbm==4.6.0 +siphash24==1.7 +pytools==2025.1.3 +pycuda==2025.1 +gensim==4.3.3 +torchtune==0.6.1 +tbb==2022.1.0 +mkl==2025.1.0 +tbb4py==2022.1.0 +shapely==2.1.0 +libpysal==4.9.2 +intel-cmplr-lib-ur==2024.2.0 +intel-cmplr-lib-rt==2024.2.0 +mkl-umath==0.1.1 +mkl-service==2.4.1 +mkl-random==1.2.4 +numpy==1.26.4 +intel-openmp==2024.2.0 
+mkl-fft==1.3.8 +pynvjitlink-cu12==0.5.2 +tblib==3.1.0 +psutil==7.0.0 +raft-dask-cu12==25.2.0 +partd==1.4.2 +treelite==4.4.1 +dask==2024.12.1 +cupy-cuda12x==13.4.1 +cuda-python==12.9.0 +pynvml==12.0.0 +ucx-py-cu12==0.42.0 +libcudf-cu12==25.2.2 +nvidia-nvcomp-cu12==4.2.0.11 +numba-cuda==0.2.0 +libcuml-cu12==25.2.1 +msgpack==1.1.0 +importlib_metadata==8.7.0 +fastrlock==0.8.3 +libkvikio-cu12==25.2.1 +distributed==2024.12.1 +libcuvs-cu12==25.2.1 +libucx-cu12==1.18.1 +MarkupSafe==3.0.2 +dask-cudf-cu12==25.2.2 +dask-expr==1.1.21 +rich==14.0.0 +dask-cuda==25.2.0 +zict==3.0.0 +toolz==1.0.0 +cuml-cu12==25.2.1 +pylibcudf-cu12==25.2.2 +locket==1.0.0 +nvidia-ml-py==12.575.51 +packaging==25.0 +scipy==1.15.2 +zipp==3.21.0 +python-dateutil==2.9.0.post0 +markdown-it-py==3.0.0 +tzdata==2025.2 +mdurl==0.1.2 +six==1.17.0 +pylibraft-cu12==25.2.0 +rapids-dask-dependency==25.2.0 +numba==0.60.0 +urllib3==2.4.0 +cloudpickle==3.1.1 +nvtx==0.2.11 +cudf-cu12==25.2.2 +llvmlite==0.43.0 +cuda-bindings==12.9.0 +pandas==2.2.3 +Pygments==2.19.1 +pytz==2025.2 +cachetools==5.5.2 +Jinja2==3.1.6 +rmm-cu12==25.2.0 +libucxx-cu12==0.42.0 +PyYAML==6.0.2 +tornado==6.4.2 +cuvs-cu12==25.2.1 +libraft-cu12==25.2.0 +ucxx-cu12==0.42.0 +sortedcontainers==2.4.0 +typing_extensions==4.13.2 +pyarrow==19.0.1 +distributed-ucxx-cu12==0.42.0 +learntools==0.3.5 +pycparser==2.22 +annotated-types==0.7.0 +charset-normalizer==3.4.2 +kagglehub==0.3.12 +grpcio-status==1.49.0rc1 +frozenlist==1.6.0 +protobuf==3.20.3 +dnspython==2.7.0 +attrs==25.3.0 +in-toto-attestation==0.9.3 +typing-inspection==0.4.0 +id==1.5.0 +rsa==4.9.1 +PyJWT==2.10.1 +pyOpenSSL==25.0.0 +idna==3.10 +email_validator==2.2.0 +cffi==1.17.1 +certifi==2025.4.26 +sigstore==3.6.2 +multiprocess==0.70.16 +google-cloud-automl==1.0.1 +model-signing==1.0.1 +aiohttp==3.11.18 +aiohappyeyeballs==2.6.1 +googleapis-common-protos==1.70.0 +grpclib==0.4.8 +pyasn1==0.6.1 +huggingface-hub==0.31.1 +filelock==3.18.0 +pyasn1_modules==0.4.2 +hpack==4.1.0 +xxhash==3.5.0 
+multidict==6.4.3 +propcache==0.3.1 +sigstore-protobuf-specs==0.3.2 +platformdirs==4.3.8 +rfc3161-client==1.0.1 +requests==2.32.3 +cryptography==44.0.3 +aiosignal==1.3.2 +yarl==1.20.0 +google-auth==2.40.1 +betterproto==2.0.0b6 +google-api-core==1.34.1 +datasets==3.6.0 +securesystemslib==1.3.0 +hyperframe==6.1.0 +rfc8785==0.1.4 +sigstore-rekor-types==0.0.18 +tuf==6.0.0 +grpcio==1.72.0rc1 +h2==4.2.0 +hf-xet==1.1.0 +dill==0.3.8 +tsfresh==0.21.0 +fiona==1.10.1 +urwid_readline==0.15.1 +coverage==7.8.0 +Wand==0.6.13 +xvfbwrapper==0.2.13 +qgrid==1.3.1 +jupyter_client==8.6.3 +woodwork==0.31.0 +overrides==7.7.0 +y-py==0.6.2 +ipywidgets==8.1.5 +ydata-profiling==4.16.1 +hep_ml==0.7.3 +scikit-multilearn==0.2.0 +urwid==3.0.2 +cytoolz==1.0.1 +pytesseract==0.3.13 +click-plugins==1.1.1 +onnx==1.17.0 +odfpy==1.4.1 +mpld3==0.5.10 +Boruta==0.4.3 +docstring-to-markdown==0.17 +fqdn==1.5.1 +torchinfo==1.8.0 +clint==0.5.1 +pybind11==2.13.6 +torchao==0.10.0 +PyWavelets==1.8.0 +python-lsp-server==1.12.2 +jupyter_server_terminals==0.5.3 +keras-core==0.1.7 +pandas-profiling==3.6.6 +asttokens==3.0.0 +scikit-surprise==1.1.4 +vtk==9.3.1 +jupyter-ydoc==0.2.5 +aiofiles==22.1.0 +transformers==4.51.3 +isoduration==20.11.0 +featuretools==1.31.0 +plotly-express==0.4.1 +pycryptodomex==3.22.0 +types-python-dateutil==2.9.0.20241206 +easyocr==1.7.2 +openslide-python==1.4.2 +slicer==0.0.7 +ray==2.46.0 +ImageHash==4.3.1 +pyemd==1.0.0 +fuzzywuzzy==0.18.0 +pyparsing==3.0.9 +xgboost==2.0.3 +pandasql==0.7.3 +update-checker==0.18.0 +pathos==0.3.2 +jupyter_server_fileid==0.9.3 +fasttext==0.9.3 +stopit==1.1.2 +haversine==2.9.0 +pox==0.3.6 +catboost==1.2.8 +colorlog==6.9.0 +jupyter_server==2.12.5 +geojson==3.2.0 +uri-template==1.3.0 +notebook==6.5.4 +pytorch-ignite==0.5.2 +fury==0.12.0 +igraph==0.11.8 +kornia_rs==0.1.9 +google-cloud-vision==3.10.1 +olefile==0.47 +semver==3.0.4 +gymnasium==0.29.0 +nvidia-cuda-cupti-cu12==12.4.127 +TPOT==0.12.1 +google-cloud-translate==3.12.1 +tensorflow-cloud==0.1.5 
+torchdata==0.11.0 +shap==0.44.1 +rtree==1.4.0 +ghapi==1.0.6 +ninja==1.11.1.4 +torchmetrics==1.7.1 +pygltflib==1.16.4 +Cartopy==0.24.1 +nbdev==2.3.36 +jupyter-lsp==1.5.1 +pycryptodome==3.22.0 +gpxpy==1.6.2 +orderly-set==5.4.1 +pymongo==4.12.1 +mlcrate==0.2.0 +papermill==2.6.0 +jupyterlab==3.6.8 +args==0.1.0 +typing-inspect==0.9.0 +omegaconf==2.3.0 +PyUpSet==0.1.1.post7 +dacite==1.9.2 +qtconsole==5.6.1 +visions==0.8.1 +trx-python==0.3 +Chessnut==0.4.1 +beartype==0.20.2 +deap==1.4.3 +lml==0.2.0 +jmespath==1.0.1 +jupyterlab_server==2.27.3 +ypy-websocket==0.8.4 +ansicolors==1.1.8 +tensorflow_decision_forests==1.11.0 +path.py==12.5.0 +blobfile==3.0.0 +tensorflow-io==0.37.1 +pymc3==3.11.4 +wavio==0.0.9 +cligj==0.7.2 +pdf2image==1.17.0 +dipy==1.11.0 +pyaml==25.1.0 +pypdf==5.4.0 +line_profiler==4.2.0 +pydub==0.25.1 +botocore==1.38.11 +google-cloud-videointelligence==2.16.1 +pyLDAvis==3.4.1 +antlr4-python3-runtime==4.9.3 +Janome==0.5.0 +langid==1.1.6 +simpleitk==2.5.0 +pyclipper==1.3.0.post6 +kornia==0.8.1 +scikit-plot==0.3.7 +pydegensac==0.1.2 +jupyter_server_ydoc==0.8.0 +phik==0.12.4 +keras-tuner==1.4.7 +colorama==0.4.6 +scikit-learn-intelex==2025.5.0 +json5==0.12.0 +PyArabic==0.6.15 +ydf==0.9.0 +ujson==5.10.0 +boto3==1.38.11 +alembic==1.15.2 +annoy==1.17.3 +h2o==3.46.0.7 +optuna==4.3.0 +Pympler==1.1 +s3fs==0.4.2 +geopandas==0.14.4 +nbconvert==6.4.5 +scikit-learn==1.2.2 +emoji==2.14.1 +watchdog==6.0.0 +funcy==2.0 +deepdiff==8.4.2 +testpath==0.6.0 +rfc3986-validator==0.1.1 +nvidia-cuda-runtime-cu12==12.4.127 +nbclient==0.5.13 +Theano==1.0.5 +wurlitzer==3.1.1 +python-bidi==0.6.6 +pudb==2025.1 +plum-dispatch==2.5.7 +pytorch-lightning==2.5.1.post0 +squarify==0.4.4 +comm==0.2.2 +dataclasses-json==0.6.7 +jupyter-events==0.12.0 +pettingzoo==1.24.0 +lightning-utilities==0.14.3 +nilearn==0.10.4 +segment_anything==1.0 +kaggle-environments==1.16.11 +marshmallow==3.26.1 +eli5==0.13.0 +widgetsnbextension==4.0.14 +rgf-python==3.12.0 +ipympl==0.9.7 +tiktoken==0.9.0 
+stable-baselines3==2.1.0 +nvidia-cuda-nvrtc-cu12==12.4.127 +jedi==0.19.2 +jupyterlab-lsp==3.10.2 +python-lsp-jsonrpc==1.1.2 +aiosqlite==0.21.0 +QtPy==2.4.3 +pydicom==3.0.1 +multimethod==1.12 +docker==7.1.0 +ppft==1.7.7 +arrow==1.3.0 +isoweek==1.3.3 +texttable==1.7.0 +daal==2025.5.0 +sphinx-rtd-theme==0.2.4 +kt-legacy==1.0.5 +puremagic==1.29 +seaborn==0.12.2 +pyexcel-io==0.6.7 +matplotlib==3.7.2 +Shimmy==1.3.0 +rfc3339-validator==0.1.4 +category_encoders==2.7.0 +stumpy==1.13.0 +mamba==0.11.3 +path==17.1.0 +pyexcel-ods==0.6.0 +preprocessing==0.1.13 +lime==0.2.0.1 +htmlmin==0.1.12 +s3transfer==0.12.0 +cesium==0.12.4 +python-json-logger==3.3.0 +Theano-PyMC==1.1.2 +bayesian-optimization==2.0.3 +keras-cv==0.9.0 +gatspy==0.3 +hf_transfer==0.1.9 +scikit-optimize==0.10.2 +mne==1.9.0 +Mako==1.3.10 +mypy_extensions==1.1.0 +mistune==0.8.4 +setuptools-scm==8.3.1 +execnb==0.1.14 +openslide-bin==4.0.0.8 +google-colab==1.0.0 +mizani==0.13.2 +astunparse==1.6.3 +google-cloud-iam==2.18.3 +ipython==7.34.0 +jax==0.5.2 +pymc==5.21.2 +referencing==0.36.2 +roman-numerals-py==3.1.0 +soxr==0.5.0.post1 +libclang==18.1.1 +keras-nlp==0.18.1 +imageio==2.37.0 +geemap==0.35.3 +google-cloud-firestore==2.20.1 +clarabel==0.10.0 +h11==0.14.0 +db-dtypes==1.4.2 +imagesize==1.4.1 +py-cpuinfo==9.0.0 +debugpy==1.8.0 +stringzilla==3.12.3 +jupyterlab_pygments==0.3.0 +backcall==0.2.0 +tensorflow-hub==0.16.1 +earthengine-api==1.5.9 +requests-oauthlib==2.0.0 +scooby==0.10.0 +opencv-python-headless==4.11.0.86 +dopamine_rl==4.1.2 +etils==1.12.2 +setproctitle==1.3.5 +wandb==0.19.9 +sklearn-compat==0.1.3 +ipython-genutils==0.2.0 +catalogue==2.0.10 +sphinxcontrib-devhelp==2.0.0 +sklearn-pandas==2.2.0 +Markdown==3.7 +sphinxcontrib-qthelp==2.0.0 +google-auth-httplib2==0.2.0 +Flask==3.1.0 +preshed==3.0.9 +google-cloud-resource-manager==1.14.2 +marisa-trie==1.2.1 +google-cloud-core==2.4.3 +ipyleaflet==0.19.2 +chardet==5.2.0 +jupyter_core==5.7.2 +simple-parsing==0.1.7 +matplotlib-venn==1.1.2 +gin-config==0.5.0 
+SQLAlchemy==2.0.40 +ipython-sql==0.5.0 +toml==0.10.2 +kaggle==1.7.4.2 +jsonpointer==3.0.0 +ndindex==1.9.2 +astropy-iers-data==0.2025.3.31.0.36.18 +proglog==0.1.11 +tensorflow-io-gcs-filesystem==0.37.1 +simplejson==3.20.1 +datascience==0.17.6 +alabaster==1.0.0 +langchain-text-splitters==0.3.7 +pygit2==1.17.0 +pyshp==2.3.1 +PyGObject==3.42.0 +pytest==8.3.5 +gspread==6.2.0 +spacy-legacy==3.0.12 +diffusers==0.32.2 +librosa==0.11.0 +ibis-framework==9.5.0 +fastcore==1.7.29 +requests-toolbelt==1.0.0 +types-pytz==2025.2.0.20250326 +PyDrive==1.3.1 +google-cloud-functions==1.20.2 +imutils==0.5.4 +sentence-transformers==3.4.1 +opt_einsum==3.4.0 +moviepy==1.0.3 +en_core_web_sm==3.8.0 +langchain-core==0.3.50 +nbclassic==1.2.0 +importlib_resources==6.5.2 +xarray-einstats==0.8.0 +lazy_loader==0.4 +ipyevents==2.0.2 +immutabledict==4.2.1 +music21==9.3.0 +openai==1.70.0 +sqlglot==25.20.2 +ale-py==0.10.2 +linkify-it-py==2.0.3 +scikit-image==0.25.2 +language_data==1.3.0 +treescope==0.1.9 +nvidia-cuda-nvcc-cu12==12.5.82 +libcugraph-cu12==25.2.0 +google-crc32c==1.7.1 +google-cloud-language==2.17.1 +torchsummary==1.5.1 +webencodings==0.5.1 +webcolors==24.11.1 +pydot==3.0.4 +orbax-checkpoint==0.11.10 +google-cloud-dataproc==5.18.1 +jellyfish==1.1.0 +gym==0.25.2 +flax==0.10.5 +cramjam==2.9.1 +gdown==5.2.0 +httpimport==1.4.1 +pymystem3==0.2.0 +parso==0.8.4 +py4j==0.10.9.7 +nx-cugraph-cu12==25.2.0 +entrypoints==0.4 +fastprogress==1.0.3 +torchaudio==2.6.0+cu124 +pyogrio==0.10.0 +bigframes==1.42.0 +oauthlib==3.2.2 +tifffile==2025.3.30 +firebase-admin==6.7.0 +fastjsonschema==2.21.1 +psycopg2==2.9.10 +missingno==0.5.2 +pandas-datareader==0.10.0 +google-spark-connect==0.5.2 +Deprecated==1.2.18 +pooch==1.8.2 +cycler==0.12.1 +tensorboard==2.18.0 +tcmlib==1.3.0 +pyproj==3.7.1 +arviz==0.21.0 +duckdb==1.2.1 +inflect==7.5.0 +argon2-cffi-bindings==21.2.0 +namex==0.0.8 +nvidia-nccl-cu12==2.21.5 +rpy2==3.5.17 +torch==2.6.0+cu124 +argon2-cffi==23.1.0 +opencv-contrib-python==4.11.0.86 +atpublic==5.1 
+sphinxcontrib-applehelp==2.0.0 +google-cloud-spanner==3.53.0 +langsmith==0.3.23 +umap-learn==0.5.7 +yfinance==0.2.55 +bleach==6.2.0 +langchain==0.3.22 +jax-cuda12-plugin==0.5.1 +optree==0.14.1 +defusedxml==0.7.1 +sphinxcontrib-serializinghtml==2.0.0 +more-itertools==10.6.0 +python-utils==3.9.1 +timm==1.0.15 +Pyomo==6.8.2 +pydotplus==2.0.2 +ml-dtypes==0.4.1 +peewee==3.17.9 +google-pasta==0.2.0 +pyzmq==24.0.1 +cmdstanpy==1.2.5 +ipyparallel==8.8.0 +parsy==2.1 +bqplot==0.12.44 +spacy-loggers==1.0.5 +google-ai-generativelanguage==0.6.15 +panel==1.6.2 +prophet==1.1.6 +pydata-google-auth==1.9.1 +anyio==4.9.0 +absl-py==1.4.0 +openpyxl==3.1.5 +vega-datasets==0.9.0 +mpmath==1.3.0 +frozendict==2.4.6 +opencv-python==4.11.0.86 +cudf-polars-cu12==25.2.2 +folium==0.19.5 +mdit-py-plugins==0.4.2 +zstandard==0.23.0 +google-cloud-aiplatform==1.87.0 +langcodes==3.5.0 +pytensor==2.30.2 +blinker==1.9.0 +xyzservices==2025.1.0 +googledrivedownloader==1.1.0 +thinc==8.3.4 +google-generativeai==0.8.4 +et_xmlfile==2.0.0 +jieba==0.42.1 +pluggy==1.5.0 +hyperopt==0.2.7 +python-louvain==0.16 +google-auth-oauthlib==1.2.1 +soupsieve==2.6 +PyDrive2==1.21.3 +simsimd==6.2.1 +umf==0.10.0 +peft==0.14.0 +imbalanced-learn==0.13.0 +wcwidth==0.2.13 +narwhals==1.33.0 +typeguard==4.4.2 +blosc2==3.2.1 +spanner-graph-notebook==1.1.6 +progressbar2==4.5.0 +pexpect==4.9.0 +ptyprocess==0.7.0 +pygame==2.6.1 +docker-pycreds==0.4.0 +Cython==3.0.12 +shellingham==1.5.4 +jiter==0.9.0 +CacheControl==0.14.2 +prometheus_client==0.21.1 +nbformat==5.10.4 +python-snappy==0.7.3 +torchvision==0.21.0+cu124 +tensorflow-metadata==1.17.0 +nest-asyncio==1.6.0 +nibabel==5.3.2 +cmake==3.31.6 +multipledispatch==1.0.0 +tf_keras==2.18.0 +cloudpathlib==0.21.0 +networkx==3.4.2 +gcsfs==2025.3.2 +sentencepiece==0.2.0 +einops==0.8.1 +plotly==5.24.1 +bokeh==3.6.3 +pycairo==1.27.0 +ipytree==0.2.2 +python-box==7.3.2 +tensorflow-datasets==4.9.8 +graphviz==0.20.3 +scs==3.2.7.post2 +pillow==11.1.0 +google-api-python-client==2.164.0 
+textblob==0.19.0 +PyOpenGL==3.1.9 +google-cloud-bigtable==2.30.0 +decorator==4.4.2 +google-cloud-datastore==2.20.2 +docstring_parser==0.16 +pickleshare==0.7.5 +fastai==2.7.19 +wrapt==1.17.2 +google-cloud-storage==2.19.0 +GDAL==3.6.4 +wasabi==1.1.3 +spacy==3.8.5 +blis==1.2.1 +tensorflow-text==2.18.1 +optax==0.2.4 +gast==0.6.0 +Werkzeug==3.1.3 +colorcet==3.1.0 +python-slugify==8.0.4 +cvxpy==1.6.4 +miniKanren==1.0.3 +traitlets==5.7.1 +sqlparse==0.5.3 +terminado==0.18.1 +holidays==0.69 +sphinxcontrib-htmlhelp==2.1.0 +orjson==3.10.16 +grpc-interceptor==0.15.4 +geocoder==1.38.1 +pyviz_comms==3.0.4 +babel==2.17.0 +jax-cuda12-pjrt==0.5.1 +ply==3.11 +audioread==3.0.1 +docutils==0.21.2 +osqp==1.0.3 +distro==1.9.0 +tf-slim==1.1.0 +tokenizers==0.21.1 +tzlocal==5.3.1 +cons==0.4.6 +rpds-py==0.24.0 +geographiclib==2.0 +matplotlib-inline==0.1.7 +editdistance==0.8.1 +httpcore==1.0.7 +h5py==3.13.0 +tabulate==0.9.0 +statsmodels==0.14.4 +holoviews==1.20.2 +sentry-sdk==2.25.1 +dlib==19.24.6 +community==1.0.0b1 +bigquery-magics==0.9.0 +gym-notices==0.0.8 +notebook_shim==0.2.4 +soundfile==0.13.1 +pyspark==3.5.5 +itsdangerous==2.2.0 +jsonpatch==1.33 +plotnine==0.14.5 +prompt_toolkit==3.0.50 +traittypes==0.2.1 +autograd==1.7.0 +text-unidecode==1.3 +pycocotools==2.0.8 +jsonpickle==4.0.5 +weasel==0.4.1 +srsly==2.5.1 +wordcloud==1.9.4 +eerepr==0.1.1 +cymem==2.0.11 +smart-open==7.1.0 +patsy==1.0.1 +beautifulsoup4==4.13.3 +opentelemetry-sdk==1.31.1 +tables==3.10.2 +altair==5.5.0 +grpc-google-iam-v1==0.14.2 +cufflinks==0.17.3 +cvxopt==1.3.2 +triton==3.2.0 +PySocks==1.7.1 +uc-micro-py==1.0.3 +proto-plus==1.26.1 +Sphinx==8.2.3 +fonttools==4.57.0 +xlrd==2.0.1 +pynndescent==0.5.13 +numexpr==2.10.2 +array_record==0.7.1 +h5netcdf==1.6.1 +promise==2.3 +threadpoolctl==3.6.0 +Send2Trash==1.8.3 +sniffio==1.3.1 +httplib2==0.22.0 +jupyterlab_widgets==3.0.13 +chex==0.1.89 +confection==0.1.5 +uritemplate==4.1.1 +stanio==0.5.1 +easydict==1.13 +future==1.0.0 +tensorflow==2.18.0 +websocket-client==1.8.0 
+flatbuffers==25.2.10 +Bottleneck==1.4.2 +kiwisolver==1.4.8 +snowballstemmer==2.2.0 +colour==0.1.5 +google-genai==1.9.0 +hdbscan==0.8.40 +sphinxcontrib-jsmath==1.0.1 +google-resumable-media==2.7.2 +murmurhash==1.0.12 +portpicker==1.5.2 +Farama-Notifications==0.0.4 +accelerate==1.5.2 +jaxlib==0.5.1 +sympy==1.13.1 +ipykernel==6.17.1 +pathlib==1.0.1 +websockets==15.0.1 +pandas-stubs==2.2.2.240909 +ratelim==0.1.6 +google-cloud-bigquery-connection==1.18.2 +greenlet==3.1.1 +multitasking==0.0.11 +astropy==7.0.1 +imageio-ffmpeg==0.6.0 +opentelemetry-api==1.31.1 +pyperclip==1.9.0 +jsonschema-specifications==2024.10.1 +tinycss2==1.4.0 +keras==3.8.0 +pylibcugraph-cu12==25.2.0 +tenacity==9.1.2 +cyipopt==1.5.0 +polars==1.21.0 +oauth2client==4.1.3 +typer==0.15.2 +lxml==5.3.1 +etuples==0.3.9 +gspread-dataframe==4.0.0 +albumentations==2.0.5 +geopy==2.4.1 +logical-unification==0.4.6 +natsort==8.4.0 +prettytable==3.16.0 +GitPython==3.1.44 +pyerfa==2.0.1.5 +param==2.2.0 +keras-hub==0.18.1 +xarray==2025.1.2 +pandas-gbq==0.28.0 +google-cloud-pubsub==2.29.0 +gitdb==4.0.12 +safetensors==0.5.3 +httpx==0.28.1 +jsonschema==4.23.0 +nvidia-nvtx-cu12==12.4.127 +albucore==0.0.23 +tweepy==4.15.0 +fastdownload==0.0.7 +highspy==1.9.0 +jupyter-console==6.1.0 +branca==0.8.1 +pandocfilters==1.5.1 +yellowbrick==1.5 +opentelemetry-semantic-conventions==0.52b1 +nvidia-cusparselt-cu12==0.6.2 +contourpy==1.3.1 +tensorboard-data-server==0.7.2 +google==2.0.3 +jupyter-leaflet==0.19.2 +mlxtend==0.23.4 +humanize==4.12.2 +smmap==5.0.2 +tensorstore==0.1.73 +wheel==0.45.1 +glob2==0.7 +tensorflow-probability==0.25.0 +termcolor==3.0.1 +colorlover==0.3.0 +ipyfilechooser==0.6.0 +iniconfig==2.1.0 +dm-tree==0.1.9 +html5lib==1.1 +python-apt==0.0.0 +setuptools==75.2.0 +types-setuptools==78.1.0.20250329 +requirements-parser==0.9.0 +pip==24.1.2 +llamafactory==0.9.4.dev0 +PyGObject==3.42.1 +blinker==1.4 +jeepney==0.7.1 +six==1.16.0 +oauthlib==3.2.0 +wadllib==1.3.6 +launchpadlib==1.10.16 +dbus-python==1.2.18 +PyJWT==2.3.0 
+importlib-metadata==4.6.4 +httplib2==0.20.2 +zipp==1.0.0 +pyparsing==2.4.7 +Markdown==3.3.6 +python-apt==2.4.0+ubuntu4 +Mako==1.1.3 +lazr.restfulclient==0.14.4 +SecretStorage==3.3.1 +distro==1.7.0 +lazr.uri==1.0.6 +more-itertools==8.10.0 +MarkupSafe==2.0.1 +cryptography==3.4.8 +keyring==23.5.0 +packaging==24.1 +inflect==7.3.1 +autocommand==2.2.2 +typeguard==4.3.0 +jaraco.text==3.12.1 +importlib_resources==6.4.0 +wheel==0.43.0 +zipp==3.19.2 +platformdirs==4.2.2 +importlib_metadata==8.0.0 +tomli==2.0.1 +jaraco.collections==5.1.0 +more-itertools==10.3.0 +typing_extensions==4.12.2 +backports.tarfile==1.2.0 +jaraco.functools==4.0.1 +jaraco.context==5.3.0 diff --git a/Model/LLaMA-Factory/wandb/run-20250618_020445-o5waoqcx/files/wandb-metadata.json b/Model/LLaMA-Factory/wandb/run-20250618_020445-o5waoqcx/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..50436cd890d89c265d7c5405aedc39159716d234 --- /dev/null +++ b/Model/LLaMA-Factory/wandb/run-20250618_020445-o5waoqcx/files/wandb-metadata.json @@ -0,0 +1,51 @@ +{ + "os": "Linux-6.6.56+-x86_64-with-glibc2.35", + "python": "CPython 3.11.11", + "startedAt": "2025-06-18T02:04:45.164105Z", + "args": [ + "examples/train_lora/QA.yaml" + ], + "program": "/kaggle/working/LLaMA-Factory/src/llamafactory/launcher.py", + "codePath": "src/llamafactory/launcher.py", + "git": { + "remote": "https://github.com/hiyouga/LLaMA-Factory.git", + "commit": "0e1fea71d20fa34e76b865f7c0d43915feae39c3" + }, + "email": "youssefhassan437972@gmail.com", + "root": "/kaggle/working/LLaMA-Factory", + "host": "79b2ce5216f6", + "executable": "/usr/bin/python3", + "codePathLocal": "src/llamafactory/launcher.py", + "cpu_count": 2, + "cpu_count_logical": 4, + "gpu": "Tesla T4", + "gpu_count": 2, + "disk": { + "/": { + "total": "8656922775552", + "used": "6838222368768" + } + }, + "memory": { + "total": "33662353408" + }, + "cpu": { + "count": 2, + "countLogical": 4 + }, + "gpu_nvidia": [ + { + "name": "Tesla T4", + 
"memoryTotal": "16106127360", + "cudaCores": 2560, + "architecture": "Turing" + }, + { + "name": "Tesla T4", + "memoryTotal": "16106127360", + "cudaCores": 2560, + "architecture": "Turing" + } + ], + "cudaVersion": "12.6" +} \ No newline at end of file diff --git a/Model/LLaMA-Factory/wandb/run-20250618_020445-o5waoqcx/logs/debug-core.log b/Model/LLaMA-Factory/wandb/run-20250618_020445-o5waoqcx/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..021cdea134861552a0464233837f892d2e2f5dc8 --- /dev/null +++ b/Model/LLaMA-Factory/wandb/run-20250618_020445-o5waoqcx/logs/debug-core.log @@ -0,0 +1,7 @@ +{"time":"2025-06-18T02:01:52.634091483Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpebeg84nw/port-137.txt","pid":137,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false} +{"time":"2025-06-18T02:01:52.651574372Z","level":"INFO","msg":"Will exit if parent process dies.","ppid":137} +{"time":"2025-06-18T02:01:52.655316783Z","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":44933,"Zone":""}} +{"time":"2025-06-18T02:01:52.803574594Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:37922"} +{"time":"2025-06-18T02:04:45.079491429Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:36422"} +{"time":"2025-06-18T02:04:45.169314773Z","level":"INFO","msg":"handleInformInit: received","streamId":"o5waoqcx","id":"127.0.0.1:36422"} +{"time":"2025-06-18T02:04:45.428092479Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"o5waoqcx","id":"127.0.0.1:36422"} diff --git a/Model/LLaMA-Factory/wandb/run-20250618_020445-o5waoqcx/logs/debug-internal.log b/Model/LLaMA-Factory/wandb/run-20250618_020445-o5waoqcx/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..d68dc2b105686882a2e9846d36d51b3aa2b97dfc --- /dev/null +++ 
b/Model/LLaMA-Factory/wandb/run-20250618_020445-o5waoqcx/logs/debug-internal.log @@ -0,0 +1,7 @@ +{"time":"2025-06-18T02:04:45.169771821Z","level":"INFO","msg":"stream: starting","core version":"0.19.9","symlink path":"/kaggle/working/LLaMA-Factory/wandb/run-20250618_020445-o5waoqcx/logs/debug-core.log"} +{"time":"2025-06-18T02:04:45.427972883Z","level":"INFO","msg":"created new stream","id":"o5waoqcx"} +{"time":"2025-06-18T02:04:45.428081239Z","level":"INFO","msg":"stream: started","id":"o5waoqcx"} +{"time":"2025-06-18T02:04:45.428182138Z","level":"INFO","msg":"sender: started","stream_id":"o5waoqcx"} +{"time":"2025-06-18T02:04:45.428270431Z","level":"INFO","msg":"handler: started","stream_id":"o5waoqcx"} +{"time":"2025-06-18T02:04:45.428273846Z","level":"INFO","msg":"writer: Do: started","stream_id":"o5waoqcx"} +{"time":"2025-06-18T02:04:45.721935563Z","level":"INFO","msg":"Starting system monitor"} diff --git a/Model/LLaMA-Factory/wandb/run-20250618_020445-o5waoqcx/logs/debug.log b/Model/LLaMA-Factory/wandb/run-20250618_020445-o5waoqcx/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..afdb722aae1e8a6304b2572da6e3ed067d2818bd --- /dev/null +++ b/Model/LLaMA-Factory/wandb/run-20250618_020445-o5waoqcx/logs/debug.log @@ -0,0 +1,26 @@ +2025-06-18 02:04:45,149 INFO MainThread:294 [wandb_setup.py:_flush():67] Current SDK version is 0.19.9 +2025-06-18 02:04:45,150 INFO MainThread:294 [wandb_setup.py:_flush():67] Configure stats pid to 294 +2025-06-18 02:04:45,150 INFO MainThread:294 [wandb_setup.py:_flush():67] Loading settings from /root/.config/wandb/settings +2025-06-18 02:04:45,150 INFO MainThread:294 [wandb_setup.py:_flush():67] Loading settings from /kaggle/working/LLaMA-Factory/wandb/settings +2025-06-18 02:04:45,150 INFO MainThread:294 [wandb_setup.py:_flush():67] Loading settings from environment variables +2025-06-18 02:04:45,150 INFO MainThread:294 [wandb_init.py:setup_run_log_directory():662] Logging user logs to 
/kaggle/working/LLaMA-Factory/wandb/run-20250618_020445-o5waoqcx/logs/debug.log +2025-06-18 02:04:45,150 INFO MainThread:294 [wandb_init.py:setup_run_log_directory():663] Logging internal logs to /kaggle/working/LLaMA-Factory/wandb/run-20250618_020445-o5waoqcx/logs/debug-internal.log +2025-06-18 02:04:45,150 INFO MainThread:294 [wandb_init.py:init():781] calling init triggers +2025-06-18 02:04:45,150 INFO MainThread:294 [wandb_init.py:init():786] wandb.init called with sweep_config: {} +config: {'_wandb': {}} +2025-06-18 02:04:45,150 INFO MainThread:294 [wandb_init.py:init():809] starting backend +2025-06-18 02:04:45,150 INFO MainThread:294 [wandb_init.py:init():813] sending inform_init request +2025-06-18 02:04:45,163 INFO MainThread:294 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-06-18 02:04:45,163 INFO MainThread:294 [wandb_init.py:init():823] backend started and connected +2025-06-18 02:04:45,172 INFO MainThread:294 [wandb_init.py:init():915] updated telemetry +2025-06-18 02:04:45,188 INFO MainThread:294 [wandb_init.py:init():939] communicating run to backend with 90.0 second timeout +2025-06-18 02:04:45,709 INFO MainThread:294 [wandb_init.py:init():1014] starting run threads in backend +2025-06-18 02:04:46,481 INFO MainThread:294 [wandb_run.py:_console_start():2454] atexit reg +2025-06-18 02:04:46,482 INFO MainThread:294 [wandb_run.py:_redirect():2306] redirect: wrap_raw +2025-06-18 02:04:46,482 INFO MainThread:294 [wandb_run.py:_redirect():2371] Wrapping output streams. +2025-06-18 02:04:46,482 INFO MainThread:294 [wandb_run.py:_redirect():2394] Redirects installed. 
+2025-06-18 02:04:46,499 INFO MainThread:294 [wandb_init.py:init():1056] run started, returning control to user process +2025-06-18 02:04:46,503 INFO MainThread:294 [wandb_run.py:_config_callback():1327] config_cb None None {'peft_config': {'default': {'task_type': , 'peft_type': , 'auto_mapping': None, 'base_model_name_or_path': 'Qwen/Qwen2.5-1.5B', 'revision': None, 'inference_mode': False, 'r': 64, 'target_modules': {'q_proj', 'up_proj', 'down_proj', 'k_proj', 'gate_proj', 'v_proj', 'o_proj'}, 'exclude_modules': None, 'lora_alpha': 128, 'lora_dropout': 0.0, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'loftq_config': {}, 'eva_config': None, 'use_dora': False, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False}}, 'vocab_size': 151936, 'max_position_embeddings': 131072, 'hidden_size': 1536, 'intermediate_size': 8960, 'num_hidden_layers': 28, 'num_attention_heads': 12, 'use_sliding_window': False, 'sliding_window': 131072, 'max_window_layers': 28, 'num_key_value_heads': 2, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'rope_theta': 1000000.0, 'rope_scaling': None, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 
50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['Qwen2ForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 151643, 'pad_token_id': None, 'eos_token_id': 151643, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'Qwen/Qwen2.5-1.5B', '_attn_implementation_autoset': True, 'transformers_version': '4.51.3', 'model_type': 'qwen2', 'use_mrope': False, 'output_dir': '/kaggle/working/', 'overwrite_output_dir': True, 'do_train': True, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 16, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 0.0001, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3.0, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.1, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': '/kaggle/working/runs/Jun18_02-03-42_79b2ce5216f6', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 50, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 200, 'save_total_limit': None, 
'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 200, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': 'Qwennn', 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': ['labels'], 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'tp_size': 0, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': False, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': '/kaggle/working/Model/last-checkpoint', 'hub_model_id': None, 'hub_strategy': 'checkpoint', 'hub_token': '', 'hub_private_repo': True, 'hub_always_push': False, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': 
False, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 180000000, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False, 'average_tokens_across_devices': False, 'sortish_sampler': False, 'predict_with_generate': False, 'generation_max_length': 2048, 'generation_num_beams': None, 'generation_config': None, 'ray_run_name': None, 'ray_storage_path': './saves', 'ray_storage_filesystem': None, 'ray_num_workers': 1, 'resources_per_worker': {'GPU': 1}, 'placement_strategy': 'PACK', 'ray_init_kwargs': None} +2025-06-18 02:04:46,536 INFO MainThread:294 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 1617573376 - > +2025-06-18 02:04:46,536 INFO MainThread:294 [wandb_run.py:_config_callback():1327] config_cb model/num_parameters 1617573376 None +2025-06-18 02:04:46,542 INFO MainThread:294 [wandb_run.py:_config_callback():1327] config_cb None None {'model_args': {'model_name_or_path': 'Qwen/Qwen2.5-1.5B', 'adapter_name_or_path': None, 'adapter_folder': None, 'cache_dir': None, 'use_fast_tokenizer': True, 'resize_vocab': False, 'split_special_tokens': False, 'add_tokens': None, 'add_special_tokens': None, 'model_revision': 'main', 'low_cpu_mem_usage': True, 'rope_scaling': None, 'flash_attn': 'AUTO', 'shift_attn': False, 'mixture_of_depths': None, 'use_unsloth': False, 'use_unsloth_gc': False, 'enable_liger_kernel': False, 'moe_aux_loss_coef': None, 'disable_gradient_checkpointing': False, 'use_reentrant_gc': True, 
'upcast_layernorm': False, 'upcast_lmhead_output': False, 'train_from_scratch': False, 'infer_backend': 'HF', 'offload_folder': 'offload', 'use_cache': True, 'infer_dtype': 'auto', 'hf_hub_token': '', 'ms_hub_token': '', 'om_hub_token': '', 'print_param_status': False, 'trust_remote_code': True, 'quantization_method': 'BNB', 'quantization_bit': None, 'quantization_type': 'nf4', 'double_quantization': True, 'quantization_device_map': None, 'image_max_pixels': 589824, 'image_min_pixels': 1024, 'image_do_pan_and_scan': False, 'crop_to_patches': False, 'video_max_pixels': 65536, 'video_min_pixels': 256, 'video_fps': 2.0, 'video_maxlen': 128, 'use_audio_in_video': False, 'audio_sampling_rate': 16000, 'export_dir': None, 'export_size': 5, 'export_device': 'cpu', 'export_quantization_bit': None, 'export_quantization_dataset': None, 'export_quantization_nsamples': 128, 'export_quantization_maxlen': 1024, 'export_legacy_format': False, 'export_hub_model_id': 'Youssef/QWEN_Arabic_Q&A', 'vllm_maxlen': 4096, 'vllm_gpu_util': 0.7, 'vllm_enforce_eager': False, 'vllm_max_lora_rank': 32, 'vllm_config': None, 'sglang_maxlen': 4096, 'sglang_mem_fraction': 0.7, 'sglang_tp_size': -1, 'sglang_config': None, 'sglang_lora_backend': 'triton', 'compute_dtype': 'torch.bfloat16', 'device_map': {'': 'cuda:0'}, 'model_max_length': 2048, 'block_diag_attn': False}, 'data_args': {'template': 'qwen', 'dataset': ['QAtrain'], 'eval_dataset': ['QAval'], 'dataset_dir': 'data', 'media_dir': 'data', 'cutoff_len': 2048, 'train_on_prompt': False, 'mask_history': False, 'streaming': False, 'buffer_size': 16384, 'mix_strategy': 'concat', 'interleave_probs': None, 'overwrite_cache': True, 'preprocessing_batch_size': 1000, 'preprocessing_num_workers': 8, 'max_samples': None, 'eval_num_beams': None, 'ignore_pad_token_for_loss': True, 'val_size': 0.0, 'eval_on_each_dataset': False, 'packing': False, 'neat_packing': False, 'tool_format': None, 'default_system': None, 'enable_thinking': True, 'tokenized_path': 
None, 'data_shared_file_system': False}, 'finetuning_args': {'freeze_trainable_layers': 2, 'freeze_trainable_modules': ['all'], 'freeze_extra_modules': None, 'additional_target': None, 'lora_alpha': 128, 'lora_dropout': 0.0, 'lora_rank': 64, 'lora_target': ['all'], 'loraplus_lr_ratio': None, 'loraplus_lr_embedding': 1e-06, 'use_rslora': False, 'use_dora': False, 'pissa_init': False, 'pissa_iter': 16, 'pissa_convert': False, 'create_new_adapter': False, 'pref_beta': 0.1, 'pref_ftx': 0.0, 'pref_loss': 'sigmoid', 'dpo_label_smoothing': 0.0, 'kto_chosen_weight': 1.0, 'kto_rejected_weight': 1.0, 'simpo_gamma': 0.5, 'ppo_buffer_size': 1, 'ppo_epochs': 4, 'ppo_score_norm': False, 'ppo_target': 6.0, 'ppo_whiten_rewards': False, 'ref_model': None, 'ref_model_adapters': None, 'ref_model_quantization_bit': None, 'reward_model': None, 'reward_model_adapters': None, 'reward_model_quantization_bit': None, 'reward_model_type': 'lora', 'ld_alpha': None, 'use_galore': False, 'galore_target': ['all'], 'galore_rank': 16, 'galore_update_interval': 200, 'galore_scale': 2.0, 'galore_proj_type': 'std', 'galore_layerwise': False, 'use_apollo': False, 'apollo_target': ['all'], 'apollo_rank': 16, 'apollo_update_interval': 200, 'apollo_scale': 32.0, 'apollo_proj': 'random', 'apollo_proj_type': 'std', 'apollo_scale_type': 'channel', 'apollo_layerwise': False, 'apollo_scale_front': False, 'use_badam': False, 'badam_mode': 'layer', 'badam_start_block': None, 'badam_switch_mode': 'ascending', 'badam_switch_interval': 50, 'badam_update_ratio': 0.05, 'badam_mask_mode': 'adjacent', 'badam_verbose': 0, 'use_swanlab': False, 'swanlab_project': 'llamafactory', 'swanlab_workspace': None, 'swanlab_run_name': None, 'swanlab_mode': 'cloud', 'swanlab_api_key': '', 'swanlab_logdir': None, 'swanlab_lark_webhook_url': None, 'swanlab_lark_secret': None, 'pure_bf16': False, 'stage': 'sft', 'finetuning_type': 'lora', 'use_llama_pro': False, 'use_adam_mini': False, 'use_muon': False, 'freeze_vision_tower': True, 
'freeze_multi_modal_projector': True, 'freeze_language_model': False, 'compute_accuracy': False, 'disable_shuffling': False, 'early_stopping_steps': None, 'plot_loss': True, 'include_effective_tokens_per_second': False}, 'generating_args': {'do_sample': True, 'temperature': 0.95, 'top_p': 0.7, 'top_k': 50, 'num_beams': 1, 'max_new_tokens': 1024, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'skip_special_tokens': True}} diff --git a/Model/LLaMA-Factory/wandb/run-20250618_020445-o5waoqcx/run-o5waoqcx.wandb b/Model/LLaMA-Factory/wandb/run-20250618_020445-o5waoqcx/run-o5waoqcx.wandb new file mode 100644 index 0000000000000000000000000000000000000000..4d953f0bfdc818bbaaaeab6ded7543c37f16a1ed --- /dev/null +++ b/Model/LLaMA-Factory/wandb/run-20250618_020445-o5waoqcx/run-o5waoqcx.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:284d50ee71dee4dfc4be4828d223dee4e61b75e06faf69216360e2ce81c23e1f +size 3244032 diff --git a/Model/Model/.gitattributes b/Model/Model/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..fb5ecab7a422a33732ea9d9cab63b23d83e482c0 --- /dev/null +++ b/Model/Model/.gitattributes @@ -0,0 +1,45 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs 
diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text +last-checkpoint/tokenizer.json filter=lfs diff=lfs merge=lfs -text +LLaMA-Factory/assets/wechat.jpg filter=lfs diff=lfs merge=lfs -text +LLaMA-Factory/assets/wechat_alaya.png filter=lfs diff=lfs merge=lfs -text +LLaMA-Factory/assets/wechat_npu.jpg filter=lfs diff=lfs merge=lfs -text +LLaMA-Factory/data/mllm_demo_data/1.mp3 filter=lfs diff=lfs merge=lfs -text +LLaMA-Factory/data/mllm_demo_data/1.mp4 filter=lfs diff=lfs merge=lfs -text +LLaMA-Factory/data/mllm_demo_data/2.avi filter=lfs diff=lfs merge=lfs -text +LLaMA-Factory/data/mllm_demo_data/3.flac filter=lfs diff=lfs merge=lfs -text +LLaMA-Factory/data/mllm_demo_data/3.mp4 filter=lfs diff=lfs merge=lfs -text diff --git a/Model/Model/LLaMA-Factory/.dockerignore b/Model/Model/LLaMA-Factory/.dockerignore new file mode 100644 index 0000000000000000000000000000000000000000..a07ec860d9e8a49793fb478c405607bbdb3bf80e --- /dev/null +++ b/Model/Model/LLaMA-Factory/.dockerignore @@ -0,0 +1,15 @@ +.vscode +.git +.github +.venv +cache +docker +saves +hf_cache +ms_cache +om_cache +shared_data +output +.dockerignore +.gitattributes +.gitignore diff --git a/Model/Model/LLaMA-Factory/.env.local b/Model/Model/LLaMA-Factory/.env.local new file mode 
100644 index 0000000000000000000000000000000000000000..88ac8a46f115203f1a3faaeae503f063062fbe0d --- /dev/null +++ b/Model/Model/LLaMA-Factory/.env.local @@ -0,0 +1,42 @@ +# Note: actually we do not support .env, just for reference +# api +API_HOST= +API_PORT= +API_KEY= +API_MODEL_NAME= +API_VERBOSE= +FASTAPI_ROOT_PATH= +MAX_CONCURRENT= +# general +DISABLE_VERSION_CHECK= +FORCE_CHECK_IMPORTS= +ALLOW_EXTRA_ARGS= +LLAMAFACTORY_VERBOSITY= +USE_MODELSCOPE_HUB= +USE_OPENMIND_HUB= +USE_RAY= +RECORD_VRAM= +OPTIM_TORCH= +NPU_JIT_COMPILE= +# torchrun +FORCE_TORCHRUN= +MASTER_ADDR= +MASTER_PORT= +NNODES= +NODE_RANK= +NPROC_PER_NODE= +# wandb +WANDB_DISABLED= +WANDB_PROJECT= +WANDB_API_KEY= +# gradio ui +GRADIO_SHARE= +GRADIO_SERVER_NAME= +GRADIO_SERVER_PORT= +GRADIO_ROOT_PATH= +GRADIO_IPV6= +# setup +ENABLE_SHORT_CONSOLE= +# reserved (do not use) +LLAMABOARD_ENABLED= +LLAMABOARD_WORKDIR= diff --git a/Model/Model/LLaMA-Factory/.gitattributes b/Model/Model/LLaMA-Factory/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..dfe0770424b2a19faf507a501ebfc23be8f54e7b --- /dev/null +++ b/Model/Model/LLaMA-Factory/.gitattributes @@ -0,0 +1,2 @@ +# Auto detect text files and perform LF normalization +* text=auto diff --git a/Model/Model/LLaMA-Factory/.github/CODE_OF_CONDUCT.md b/Model/Model/LLaMA-Factory/.github/CODE_OF_CONDUCT.md new file mode 100644 index 0000000000000000000000000000000000000000..c2035cea5425b8de8e88a563214d05dfd415352a --- /dev/null +++ b/Model/Model/LLaMA-Factory/.github/CODE_OF_CONDUCT.md @@ -0,0 +1,128 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, religion, or sexual 
identity +and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. + +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +* Focusing on what is best not just for us as individuals, but for the + overall community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or + advances of any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email + address, without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. 
+Examples of representing our community include using an official e-mail address, +posting via an official social media account, or acting as an appointed +representative at an online or offline event. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement at +`hoshihiyouga AT gmail DOT com`. +All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series +of actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. Violating these terms may lead to a temporary or +permanent ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. + +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. 
No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within +the community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +version 2.0, available at +https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. + +Community Impact Guidelines were inspired by [Mozilla's code of conduct +enforcement ladder](https://github.com/mozilla/diversity). + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see the FAQ at +https://www.contributor-covenant.org/faq. Translations are available at +https://www.contributor-covenant.org/translations. diff --git a/Model/Model/LLaMA-Factory/.github/CONTRIBUTING.md b/Model/Model/LLaMA-Factory/.github/CONTRIBUTING.md new file mode 100644 index 0000000000000000000000000000000000000000..507d666a23fc35f51b931e4f032c6d4b07872a45 --- /dev/null +++ b/Model/Model/LLaMA-Factory/.github/CONTRIBUTING.md @@ -0,0 +1,67 @@ +# Contributing to LLaMA Factory + +Everyone is welcome to contribute, and we value everybody's contribution. Code contributions are not the only way to help the community. Answering questions, helping others, and improving the documentation are also immensely valuable. + +It also helps us if you spread the word! Reference the library in blog posts about the awesome projects it made possible, shout out on Twitter every time it has helped you, or simply ⭐️ the repository to say thank you. 
+ +However you choose to contribute, please be mindful and respect our [code of conduct](CODE_OF_CONDUCT.md). + +**This guide was heavily inspired by [transformers guide to contributing](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md).** + +## Ways to contribute + +There are several ways you can contribute to LLaMA Factory: + +* Fix outstanding issues with the existing code. +* Submit issues related to bugs or desired new features. +* Contribute to the examples or to the documentation. + +### Style guide + +LLaMA Factory follows the [Google Python Style Guide](https://google.github.io/styleguide/pyguide.html), check it for details. + +### Create a Pull Request + +1. Fork the [repository](https://github.com/hiyouga/LLaMA-Factory) by clicking on the [Fork](https://github.com/hiyouga/LLaMA-Factory/fork) button on the repository's page. This creates a copy of the code under your GitHub user account. + +2. Clone your fork to your local disk, and add the base repository as a remote: + +```bash +git clone git@github.com:[username]/LLaMA-Factory.git +cd LLaMA-Factory +git remote add upstream https://github.com/hiyouga/LLaMA-Factory.git +``` + +3. Create a new branch to hold your development changes: + +```bash +git checkout -b dev_your_branch +``` + +4. Set up a development environment by running the following command in a virtual environment: + +```bash +pip install -e ".[dev]" +``` + +If LLaMA Factory was already installed in the virtual environment, remove it with `pip uninstall llamafactory` before reinstalling it in editable mode with the -e flag. + +5. Check code before commit: + +```bash +make commit +make style && make quality +make test +``` + +6. Submit changes: + +```bash +git add . +git commit -m "commit message" +git fetch upstream +git rebase upstream/main +git push -u origin dev_your_branch +``` + +7. Create a merge request from your branch `dev_your_branch` at [origin repo](https://github.com/hiyouga/LLaMA-Factory). 
diff --git a/Model/Model/LLaMA-Factory/.github/ISSUE_TEMPLATE/1-bug-report.yml b/Model/Model/LLaMA-Factory/.github/ISSUE_TEMPLATE/1-bug-report.yml new file mode 100644 index 0000000000000000000000000000000000000000..a08596faa5b3be2545412d372f7bdeadca95afb4 --- /dev/null +++ b/Model/Model/LLaMA-Factory/.github/ISSUE_TEMPLATE/1-bug-report.yml @@ -0,0 +1,61 @@ +name: "\U0001F41B Bug / help" +description: Create a report to help us improve the LLaMA Factory +labels: ["bug", "pending"] +body: + - type: markdown + attributes: + value: | + Issues included in **[FAQs](https://github.com/hiyouga/LLaMA-Factory/issues/4614)** or those with **insufficient** information may be closed without a response. + 已经包含在 **[常见问题](https://github.com/hiyouga/LLaMA-Factory/issues/4614)** 内或提供信息**不完整**的 issues 可能不会被回复。 + + - type: markdown + attributes: + value: | + Please do not create issues that are not related to framework bugs under this category, use **[Discussions](https://github.com/hiyouga/LLaMA-Factory/discussions/categories/q-a)** instead. + 请勿在此分类下创建和框架 bug 无关的 issues,训练问题求助请使用 **[讨论区](https://github.com/hiyouga/LLaMA-Factory/discussions/categories/q-a)**。 + + - type: checkboxes + id: reminder + attributes: + label: Reminder + description: | + Please ensure you have read the above rules carefully and searched the existing issues (including FAQs). + 请确保您已经认真阅读了上述规则并且搜索过现有的 issues(包括常见问题)。 + + options: + - label: I have read the above rules and searched the existing issues. + required: true + + - type: textarea + id: system-info + validations: + required: true + attributes: + label: System Info + description: | + Please share your system info with us. You can run the command **llamafactory-cli env** and copy-paste its output below. + 请提供您的系统信息。您可以在命令行运行 **llamafactory-cli env** 并将其输出复制到该文本框中。 + + placeholder: llamafactory version, platform, python version, ... 
+ + - type: textarea + id: reproduction + validations: + required: true + attributes: + label: Reproduction + description: | + Please provide entry arguments, error messages and stack traces that reproduces the problem. + 请提供入口参数,错误日志以及异常堆栈以便于我们复现问题。 + + value: | + ```text + Put your message here. + ``` + + - type: textarea + id: others + validations: + required: false + attributes: + label: Others diff --git a/Model/Model/LLaMA-Factory/.github/ISSUE_TEMPLATE/2-feature-request.yml b/Model/Model/LLaMA-Factory/.github/ISSUE_TEMPLATE/2-feature-request.yml new file mode 100644 index 0000000000000000000000000000000000000000..5d72271ebc8db3d10bf7e9c6af209e857566bde6 --- /dev/null +++ b/Model/Model/LLaMA-Factory/.github/ISSUE_TEMPLATE/2-feature-request.yml @@ -0,0 +1,41 @@ +name: "\U0001F680 Feature request" +description: Submit a request for a new feature +labels: ["enhancement", "pending"] +body: + - type: markdown + attributes: + value: | + Please do not create issues that are not related to new features under this category. + 请勿在此分类下创建和新特性无关的 issues。 + + - type: checkboxes + id: reminder + attributes: + label: Reminder + description: | + Please ensure you have read the above rules carefully and searched the existing issues. + 请确保您已经认真阅读了上述规则并且搜索过现有的 issues。 + + options: + - label: I have read the above rules and searched the existing issues. + required: true + + - type: textarea + id: description + validations: + required: true + attributes: + label: Description + description: | + A clear and concise description of the feature proposal. + 请详细描述您希望加入的新功能特性。 + + - type: textarea + id: contribution + validations: + required: false + attributes: + label: Pull Request + description: | + Have you already created the relevant PR and submitted the code? + 您是否已经创建了相关 PR 并提交了代码? 
diff --git a/Model/Model/LLaMA-Factory/.github/ISSUE_TEMPLATE/config.yml b/Model/Model/LLaMA-Factory/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000000000000000000000000000000000000..3ba13e0cec6cbbfd462e9ebf529dd2093148cd69 --- /dev/null +++ b/Model/Model/LLaMA-Factory/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1 @@ +blank_issues_enabled: false diff --git a/Model/Model/LLaMA-Factory/.github/PULL_REQUEST_TEMPLATE.md b/Model/Model/LLaMA-Factory/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000000000000000000000000000000000000..d23d6be3cfb8e2db888b19becedf075c7aa527be --- /dev/null +++ b/Model/Model/LLaMA-Factory/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,8 @@ +# What does this PR do? + +Fixes # (issue) + +## Before submitting + +- [ ] Did you read the [contributor guideline](https://github.com/hiyouga/LLaMA-Factory/blob/main/.github/CONTRIBUTING.md)? +- [ ] Did you write any new necessary tests? diff --git a/Model/Model/LLaMA-Factory/.github/SECURITY.md b/Model/Model/LLaMA-Factory/.github/SECURITY.md new file mode 100644 index 0000000000000000000000000000000000000000..d34728ebfeb22e9fda2f3e76ff133014b648ab3c --- /dev/null +++ b/Model/Model/LLaMA-Factory/.github/SECURITY.md @@ -0,0 +1,7 @@ +# Reporting Security Issues + +To report a security issue, please use the GitHub Security Advisory ["Report a Vulnerability"](https://github.com/hiyouga/LLaMA-Factory/security/advisories/new) tab. + +We will send a response indicating the next steps in handling your report. After the initial reply to your report, the security team will keep you informed of the progress towards a fix and full announcement, and may ask for additional information or guidance. + +Report security bugs in third-party modules to the person or team maintaining the module. 
diff --git a/Model/Model/LLaMA-Factory/.github/workflows/docker.yml b/Model/Model/LLaMA-Factory/.github/workflows/docker.yml new file mode 100644 index 0000000000000000000000000000000000000000..c3c3800591c1f36bdcd83608bc52fdeb0bc635c5 --- /dev/null +++ b/Model/Model/LLaMA-Factory/.github/workflows/docker.yml @@ -0,0 +1,66 @@ +name: docker + +on: + workflow_dispatch: + push: + branches: + - "main" + paths: + - "**/*.py" + - "requirements.txt" + - "docker/**" + - ".github/workflows/*.yml" + pull_request: + branches: + - "main" + paths: + - "**/*.py" + - "requirements.txt" + - "docker/**" + - ".github/workflows/*.yml" + +jobs: + build: + runs-on: ubuntu-latest + + concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} + + environment: + name: docker + url: https://hub.docker.com/r/hiyouga/llamafactory + + steps: + - name: Free up disk space + run: | + df -h + sudo rm -rf /usr/share/dotnet + sudo rm -rf /opt/ghc + sudo rm -rf /opt/hostedtoolcache + df -h + + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Login to Docker Hub + if: github.event_name != 'pull_request' + uses: docker/login-action@v3 + with: + username: ${{ vars.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: Build and push Docker image + uses: docker/build-push-action@v6 + with: + context: . 
+ file: ./docker/docker-cuda/Dockerfile + build-args: | + EXTRAS=metrics,deepspeed,liger-kernel + push: ${{ github.event_name != 'pull_request' }} + tags: docker.io/hiyouga/llamafactory:latest + cache-from: type=gha + cache-to: type=gha,mode=max diff --git a/Model/Model/LLaMA-Factory/.github/workflows/label_issue.yml b/Model/Model/LLaMA-Factory/.github/workflows/label_issue.yml new file mode 100644 index 0000000000000000000000000000000000000000..b7469f6a1fd2adfd1bd868548014b0bf9a22cd1e --- /dev/null +++ b/Model/Model/LLaMA-Factory/.github/workflows/label_issue.yml @@ -0,0 +1,32 @@ +name: label_issue + +on: + issues: + types: + - opened + +jobs: + label_issue: + runs-on: ubuntu-latest + + permissions: + issues: write + + steps: + - env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + ISSUE_URL: ${{ github.event.issue.html_url }} + ISSUE_TITLE: ${{ github.event.issue.title }} + run: | + LABEL="" + NPU_KEYWORDS=(npu huawei ascend 华为 昇腾) + ISSUE_TITLE_LOWER=$(echo $ISSUE_TITLE | tr '[:upper:]' '[:lower:]') + for KEYWORD in ${NPU_KEYWORDS[@]}; do + if [[ $ISSUE_TITLE_LOWER == *$KEYWORD* ]] && [[ $ISSUE_TITLE_LOWER != *input* ]]; then + LABEL="npu" + break + fi + done + if [ -n "$LABEL" ]; then + gh issue edit $ISSUE_URL --add-label $LABEL + fi diff --git a/Model/Model/LLaMA-Factory/.github/workflows/publish.yml b/Model/Model/LLaMA-Factory/.github/workflows/publish.yml new file mode 100644 index 0000000000000000000000000000000000000000..c3f729a085d326e29ef6c5bc4912c71278ebdd82 --- /dev/null +++ b/Model/Model/LLaMA-Factory/.github/workflows/publish.yml @@ -0,0 +1,36 @@ +name: publish + +on: + workflow_dispatch: + release: + types: + - published + +jobs: + publish: + name: Upload release to PyPI + + runs-on: ubuntu-latest + + environment: + name: release + url: https://pypi.org/p/llamafactory + + permissions: + id-token: write + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.9" + + - 
name: Build package + run: | + make build + + - name: Publish package + uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/Model/Model/LLaMA-Factory/.github/workflows/tests.yml b/Model/Model/LLaMA-Factory/.github/workflows/tests.yml new file mode 100644 index 0000000000000000000000000000000000000000..41923092cc9aaa4c6131310790a02b88842155ff --- /dev/null +++ b/Model/Model/LLaMA-Factory/.github/workflows/tests.yml @@ -0,0 +1,102 @@ +name: tests + +on: + workflow_dispatch: + push: + branches: + - "main" + paths: + - "**.py" + - "requirements.txt" + - ".github/workflows/*.yml" + pull_request: + branches: + - "main" + paths: + - "**.py" + - "requirements.txt" + - ".github/workflows/*.yml" + +jobs: + tests: + strategy: + fail-fast: false + matrix: + python: + - "3.9" + - "3.10" + - "3.11" + - "3.12" + os: + - "ubuntu-latest" + - "windows-latest" + - "macos-13" + transformers: + - null + include: # test backward compatibility + - python: "3.9" + os: "ubuntu-latest" + transformers: "4.45.0" + - python: "3.9" + os: "ubuntu-latest" + transformers: "4.49.0" + - python: "3.9" + os: "ubuntu-latest" + transformers: "4.51.0" + + runs-on: ${{ matrix.os }} + + concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os }}-${{ matrix.python }}-${{ matrix.transformers }} + cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} + + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + OS_NAME: ${{ matrix.os }} + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python }} + cache: "pip" + cache-dependency-path: "**/requirements*.txt" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install ".[torch,dev]" + + - name: Install transformers + if: ${{ matrix.transformers }} + run: | + python -m pip install "transformers==${{ matrix.transformers }}" + + - name: Cache files + id: hf-hub-cache + uses: actions/cache@v4 + with: + 
path: ${{ runner.temp }}/huggingface + key: huggingface-${{ matrix.os }}-${{ matrix.python }}-${{ matrix.transformers }}-${{ hashFiles('tests/version.txt') }} + + - name: Check quality + run: | + make style && make quality + + - name: Check license + run: | + make license + + - name: Check build + run: | + make build + + - name: Test with pytest + run: | + make test + env: + HF_HOME: ${{ runner.temp }}/huggingface + HF_HUB_OFFLINE: "${{ steps.hf-hub-cache.outputs.cache-hit == 'true' && '1' || '0' }}" diff --git a/Model/Model/LLaMA-Factory/.gitignore b/Model/Model/LLaMA-Factory/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..0a3a47bd8b251892e9d8db7b6878accc99c1a5bc --- /dev/null +++ b/Model/Model/LLaMA-Factory/.gitignore @@ -0,0 +1,179 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +.idea/ + +# vscode +.vscode/ + +# uv +uv.lock + +# custom .gitignore +hf_cache/ +ms_cache/ +om_cache/ +cache/ +config/ +saves/ +output/ +wandb/ +swanlog/ +generated_predictions.jsonl +predictions_score.json diff --git a/Model/Model/LLaMA-Factory/.pre-commit-config.yaml b/Model/Model/LLaMA-Factory/.pre-commit-config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cbe361eeb5ffb7dd31b9203b691316906f0b1fe8 --- /dev/null +++ b/Model/Model/LLaMA-Factory/.pre-commit-config.yaml @@ -0,0 +1,28 @@ +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v5.0.0 + hooks: + - id: check-ast + - id: check-added-large-files + args: ['--maxkb=25000'] + - id: check-merge-conflict + - id: check-yaml + - id: debug-statements + - id: end-of-file-fixer + - id: trailing-whitespace + args: [--markdown-linebreak-ext=md] + - id: no-commit-to-branch + args: ['--branch', 'main'] + +- repo: https://github.com/asottile/pyupgrade + rev: v3.17.0 + hooks: + - id: pyupgrade + args: [--py38-plus] + +- repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.6.9 
+ hooks: + - id: ruff + args: [--fix] + - id: ruff-format diff --git a/Model/Model/LLaMA-Factory/CITATION.cff b/Model/Model/LLaMA-Factory/CITATION.cff new file mode 100644 index 0000000000000000000000000000000000000000..01b4c9fd28aed295d50c71a7a3ed2e97a69434d4 --- /dev/null +++ b/Model/Model/LLaMA-Factory/CITATION.cff @@ -0,0 +1,44 @@ +cff-version: 1.2.0 +date-released: 2024-03 +message: "If you use this software, please cite it as below." +authors: +- family-names: "Zheng" + given-names: "Yaowei" +- family-names: "Zhang" + given-names: "Richong" +- family-names: "Zhang" + given-names: "Junhao" +- family-names: "Ye" + given-names: "Yanhan" +- family-names: "Luo" + given-names: "Zheyan" +- family-names: "Feng" + given-names: "Zhangchi" +- family-names: "Ma" + given-names: "Yongqiang" +title: "LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models" +url: "https://arxiv.org/abs/2403.13372" +preferred-citation: + type: conference-paper + conference: + name: "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)" + authors: + - family-names: "Zheng" + given-names: "Yaowei" + - family-names: "Zhang" + given-names: "Richong" + - family-names: "Zhang" + given-names: "Junhao" + - family-names: "Ye" + given-names: "Yanhan" + - family-names: "Luo" + given-names: "Zheyan" + - family-names: "Feng" + given-names: "Zhangchi" + - family-names: "Ma" + given-names: "Yongqiang" + title: "LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models" + url: "https://arxiv.org/abs/2403.13372" + year: 2024 + publisher: "Association for Computational Linguistics" + address: "Bangkok, Thailand" diff --git a/Model/Model/LLaMA-Factory/LICENSE b/Model/Model/LLaMA-Factory/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..b09cd7856d58590578ee1a4f3ad45d1310a97f87 --- /dev/null +++ b/Model/Model/LLaMA-Factory/LICENSE @@ -0,0 +1,201 @@ +Apache License + Version 2.0, January 2004 + 
http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. 
For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/Model/Model/LLaMA-Factory/MANIFEST.in b/Model/Model/LLaMA-Factory/MANIFEST.in new file mode 100644 index 0000000000000000000000000000000000000000..82c51f6348a58f9da0c839a61b0063b9aba66d75 --- /dev/null +++ b/Model/Model/LLaMA-Factory/MANIFEST.in @@ -0,0 +1 @@ +include LICENSE requirements.txt diff --git a/Model/Model/LLaMA-Factory/Makefile b/Model/Model/LLaMA-Factory/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..2dcb7caf69b3b85b0c143dc62d2053515a5346a1 --- /dev/null +++ b/Model/Model/LLaMA-Factory/Makefile @@ -0,0 +1,24 @@ +.PHONY: build commit license quality style test + +check_dirs := scripts src tests setup.py + +build: + pip3 install build && python3 -m build + +commit: + pre-commit install + pre-commit run --all-files + +license: + python3 tests/check_license.py $(check_dirs) + +quality: + ruff check $(check_dirs) + ruff format --check $(check_dirs) + +style: + ruff check $(check_dirs) --fix + ruff format $(check_dirs) + +test: + CUDA_VISIBLE_DEVICES= WANDB_DISABLED=true pytest -vv tests/ diff --git a/Model/Model/LLaMA-Factory/README.md b/Model/Model/LLaMA-Factory/README.md new file mode 100644 index 0000000000000000000000000000000000000000..db981f9fef21b3ccf642a5a29ff9046a0b76fc8d --- /dev/null +++ b/Model/Model/LLaMA-Factory/README.md @@ -0,0 +1,942 @@ +![# LLaMA Factory](assets/logo.png) + +[![GitHub Repo stars](https://img.shields.io/github/stars/hiyouga/LLaMA-Factory?style=social)](https://github.com/hiyouga/LLaMA-Factory/stargazers) +[![GitHub last commit](https://img.shields.io/github/last-commit/hiyouga/LLaMA-Factory)](https://github.com/hiyouga/LLaMA-Factory/commits/main) +[![GitHub contributors](https://img.shields.io/github/contributors/hiyouga/LLaMA-Factory?color=orange)](https://github.com/hiyouga/LLaMA-Factory/graphs/contributors) +[![GitHub workflow](https://github.com/hiyouga/LLaMA-Factory/actions/workflows/tests.yml/badge.svg)](https://github.com/hiyouga/LLaMA-Factory/actions/workflows/tests.yml) 
+[![PyPI](https://img.shields.io/pypi/v/llamafactory)](https://pypi.org/project/llamafactory/) +[![Citation](https://img.shields.io/badge/citation-561-green)](https://scholar.google.com/scholar?cites=12620864006390196564) +[![Docker Pulls](https://img.shields.io/docker/pulls/hiyouga/llamafactory)](https://hub.docker.com/r/hiyouga/llamafactory/tags) + +[![Twitter](https://img.shields.io/twitter/follow/llamafactory_ai)](https://twitter.com/llamafactory_ai) +[![Discord](https://dcbadge.vercel.app/api/server/rKfvV9r9FK?compact=true&style=flat)](https://discord.gg/rKfvV9r9FK) +[![GitCode](https://gitcode.com/zhengyaowei/LLaMA-Factory/star/badge.svg)](https://gitcode.com/zhengyaowei/LLaMA-Factory) + +[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1eRTPn37ltBbYsISy9Aw2NuI2Aq5CQrD9?usp=sharing) +[![Open in DSW](https://gallery.pai-ml.com/assets/open-in-dsw.svg)](https://gallery.pai-ml.com/#/preview/deepLearning/nlp/llama_factory) +[![Open in Alaya](assets/alaya_new.svg)](https://docs.alayanew.com/docs/documents/newActivities/llamafactory/?utm_source=LLaMA-Factory) +[![Open in Spaces](https://img.shields.io/badge/🤗-Open%20in%20Spaces-blue)](https://huggingface.co/spaces/hiyouga/LLaMA-Board) +[![Open in Studios](https://img.shields.io/badge/ModelScope-Open%20in%20Studios-blue)](https://modelscope.cn/studios/hiyouga/LLaMA-Board) +[![Open in Novita](https://img.shields.io/badge/Novita-Deploy%20Template-blue)](https://novita.ai/templates-library/105981?sharer=88115474-394e-4bda-968e-b88e123d0c47) + +### Used by [Amazon](https://aws.amazon.com/cn/blogs/machine-learning/how-apoidea-group-enhances-visual-information-extraction-from-banking-documents-with-multimodal-models-using-llama-factory-on-amazon-sagemaker-hyperpod/), [NVIDIA](https://developer.nvidia.com/rtx/ai-toolkit), [Aliyun](https://help.aliyun.com/zh/pai/use-cases/fine-tune-a-llama-3-model-with-llama-factory), etc. + +
+ +### Supporters ❤️ + + + Warp sponsorship + + +#### [Warp, the agentic terminal for developers](https://warp.dev/llama-factory) + +[Available for MacOS, Linux, & Windows](https://warp.dev/llama-factory) + +---- + +### Easily fine-tune 100+ large language models with zero-code [CLI](#quickstart) and [Web UI](#fine-tuning-with-llama-board-gui-powered-by-gradio) + +![GitHub Trend](https://trendshift.io/api/badge/repositories/4535) + +
+ +👋 Join our [WeChat group](assets/wechat.jpg), [NPU user group](assets/wechat_npu.jpg) or [Alaya NeW user group](assets/wechat_alaya.png). + +\[ English | [中文](README_zh.md) \] + +**Fine-tuning a large language model can be easy as...** + +https://github.com/user-attachments/assets/3991a3a8-4276-4d30-9cab-4cb0c4b9b99e + +Choose your path: + +- **Documentation**: https://llamafactory.readthedocs.io/en/latest/ +- **Colab (free)**: https://colab.research.google.com/drive/1eRTPn37ltBbYsISy9Aw2NuI2Aq5CQrD9?usp=sharing +- **Local machine**: Please refer to [usage](#getting-started) +- **PAI-DSW (free trial)**: https://gallery.pai-ml.com/#/preview/deepLearning/nlp/llama_factory +- **Alaya NeW (cloud GPU deal)**: https://docs.alayanew.com/docs/documents/newActivities/llamafactory/?utm_source=LLaMA-Factory + +> [!NOTE] +> Except for the above links, all other websites are unauthorized third-party websites. Please carefully use them. + +## Table of Contents + +- [Features](#features) +- [Blogs](#blogs) +- [Changelog](#changelog) +- [Supported Models](#supported-models) +- [Supported Training Approaches](#supported-training-approaches) +- [Provided Datasets](#provided-datasets) +- [Requirement](#requirement) +- [Getting Started](#getting-started) + - [Installation](#installation) + - [Data Preparation](#data-preparation) + - [Quickstart](#quickstart) + - [Fine-Tuning with LLaMA Board GUI](#fine-tuning-with-llama-board-gui-powered-by-gradio) + - [Build Docker](#build-docker) + - [Deploy with OpenAI-style API and vLLM](#deploy-with-openai-style-api-and-vllm) + - [Download from ModelScope Hub](#download-from-modelscope-hub) + - [Download from Modelers Hub](#download-from-modelers-hub) + - [Use W&B Logger](#use-wb-logger) + - [Use SwanLab Logger](#use-swanlab-logger) +- [Projects using LLaMA Factory](#projects-using-llama-factory) +- [License](#license) +- [Citation](#citation) +- [Acknowledgement](#acknowledgement) + +## Features + +- **Various models**: LLaMA, LLaVA, Mistral, 
Mixtral-MoE, Qwen, Qwen2-VL, DeepSeek, Yi, Gemma, ChatGLM, Phi, etc. +- **Integrated methods**: (Continuous) pre-training, (multimodal) supervised fine-tuning, reward modeling, PPO, DPO, KTO, ORPO, etc. +- **Scalable resources**: 16-bit full-tuning, freeze-tuning, LoRA and 2/3/4/5/6/8-bit QLoRA via AQLM/AWQ/GPTQ/LLM.int8/HQQ/EETQ. +- **Advanced algorithms**: [GaLore](https://github.com/jiaweizzhao/GaLore), [BAdam](https://github.com/Ledzy/BAdam), [APOLLO](https://github.com/zhuhanqing/APOLLO), [Adam-mini](https://github.com/zyushun/Adam-mini), [Muon](https://github.com/KellerJordan/Muon), DoRA, LongLoRA, LLaMA Pro, Mixture-of-Depths, LoRA+, LoftQ and PiSSA. +- **Practical tricks**: [FlashAttention-2](https://github.com/Dao-AILab/flash-attention), [Unsloth](https://github.com/unslothai/unsloth), [Liger Kernel](https://github.com/linkedin/Liger-Kernel), RoPE scaling, NEFTune and rsLoRA. +- **Wide tasks**: Multi-turn dialogue, tool using, image understanding, visual grounding, video recognition, audio understanding, etc. +- **Experiment monitors**: LlamaBoard, TensorBoard, Wandb, MLflow, [SwanLab](https://github.com/SwanHubX/SwanLab), etc. +- **Faster inference**: OpenAI-style API, Gradio UI and CLI with [vLLM worker](https://github.com/vllm-project/vllm) or [SGLang worker](https://github.com/sgl-project/sglang). 
+ +### Day-N Support for Fine-Tuning Cutting-Edge Models + +| Support Date | Model Name | +| ------------ | ------------------------------------------------------------ | +| Day 0 | Qwen3 / Qwen2.5-VL / Gemma 3 / InternLM 3 / MiniCPM-o-2.6 | +| Day 1 | Llama 3 / GLM-4 / Mistral Small / PaliGemma2 / Llama 4 | + +## Blogs + +- [Fine-tune Qwen2.5-VL for Autonomous Driving using LLaMA-Factory](https://docs.alayanew.com/docs/documents/useGuide/LLaMAFactory/mutiple/?utm_source=LLaMA-Factory) (Chinese) +- [How Apoidea Group enhances visual information extraction from banking documents with multimodal models using LLaMA-Factory on Amazon SageMaker HyperPod](https://aws.amazon.com/cn/blogs/machine-learning/how-apoidea-group-enhances-visual-information-extraction-from-banking-documents-with-multimodal-models-using-llama-factory-on-amazon-sagemaker-hyperpod/) (English) +- [Easy Dataset × LLaMA Factory: Enabling LLMs to Efficiently Learn Domain Knowledge](https://buaa-act.feishu.cn/wiki/GVzlwYcRFiR8OLkHbL6cQpYin7g) (English) + +
All Blogs + +- [LLaMA Factory: Fine-tuning the DeepSeek-R1-Distill-Qwen-7B Model for News Classifier](https://gallery.pai-ml.com/#/preview/deepLearning/nlp/llama_factory_deepseek_r1_distill_7b) (Chinese) +- [A One-Stop Code-Free Model Fine-Tuning \& Deployment Platform based on SageMaker and LLaMA-Factory](https://aws.amazon.com/cn/blogs/china/a-one-stop-code-free-model-fine-tuning-deployment-platform-based-on-sagemaker-and-llama-factory/) (Chinese) +- [LLaMA Factory Multi-Modal Fine-Tuning Practice: Fine-Tuning Qwen2-VL for Personal Tourist Guide](https://gallery.pai-ml.com/#/preview/deepLearning/nlp/llama_factory_qwen2vl) (Chinese) +- [LLaMA Factory: Fine-tuning the LLaMA3 Model for Role-Playing](https://gallery.pai-ml.com/#/preview/deepLearning/nlp/llama_factory) (Chinese) + +
+ +## Changelog + +[25/04/28] We supported fine-tuning the **[Qwen3](https://qwenlm.github.io/blog/qwen3/)** model family. + +[25/04/21] We supported the **[Muon](https://github.com/KellerJordan/Muon)** optimizer. See [examples](examples/README.md) for usage. Thank [@tianshijing](https://github.com/tianshijing)'s PR. + +[25/04/16] We supported fine-tuning the **[InternVL3](https://huggingface.co/OpenGVLab/InternVL3-8B)** model. See [PR #7258](https://github.com/hiyouga/LLaMA-Factory/pull/7258) to get started. + +[25/04/14] We supported fine-tuning the **[GLM-Z1](https://huggingface.co/THUDM/GLM-Z1-9B-0414)** and **[Kimi-VL](https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct)** models. + +[25/04/06] We supported fine-tuning the **[Llama 4](https://ai.meta.com/blog/llama-4-multimodal-intelligence/)** model. See [PR #7611](https://github.com/hiyouga/LLaMA-Factory/pull/7611) to get started. + +
Full Changelog + +[25/03/31] We supported fine-tuning the **[Qwen2.5 Omni](https://qwenlm.github.io/blog/qwen2.5-omni/)** model. See [PR #7537](https://github.com/hiyouga/LLaMA-Factory/pull/7537) to get started. + +[25/03/15] We supported **[SGLang](https://github.com/sgl-project/sglang)** as inference backend. Try `infer_backend: sglang` to accelerate inference. + +[25/03/12] We supported fine-tuning the **[Gemma 3](https://huggingface.co/blog/gemma3)** model. + +[25/02/24] Announcing **[EasyR1](https://github.com/hiyouga/EasyR1)**, an efficient, scalable and multi-modality RL training framework for efficient GRPO training. + +[25/02/11] We supported saving the **[Ollama](https://github.com/ollama/ollama)** modelfile when exporting the model checkpoints. See [examples](examples/README.md) for usage. + +[25/02/05] We supported fine-tuning the **[Qwen2-Audio](Qwen/Qwen2-Audio-7B-Instruct)** and **[MiniCPM-o-2.6](https://huggingface.co/openbmb/MiniCPM-o-2_6)** on audio understanding tasks. + +[25/01/31] We supported fine-tuning the **[DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1)** and **[Qwen2.5-VL](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct)** models. + +[25/01/15] We supported **[APOLLO](https://arxiv.org/abs/2412.05270)** optimizer. See [examples](examples/README.md) for usage. + +[25/01/14] We supported fine-tuning the **[MiniCPM-o-2.6](https://huggingface.co/openbmb/MiniCPM-o-2_6)** and **[MiniCPM-V-2.6](https://huggingface.co/openbmb/MiniCPM-V-2_6)** models. Thank [@BUAADreamer](https://github.com/BUAADreamer)'s PR. + +[25/01/14] We supported fine-tuning the **[InternLM 3](https://huggingface.co/collections/internlm/)** models. Thank [@hhaAndroid](https://github.com/hhaAndroid)'s PR. + +[25/01/10] We supported fine-tuning the **[Phi-4](https://huggingface.co/microsoft/phi-4)** model. + +[24/12/21] We supported using **[SwanLab](https://github.com/SwanHubX/SwanLab)** for experiment tracking and visualization. 
See [this section](#use-swanlab-logger) for details. + +[24/11/27] We supported fine-tuning the **[Skywork-o1](https://huggingface.co/Skywork/Skywork-o1-Open-Llama-3.1-8B)** model and the **[OpenO1](https://huggingface.co/datasets/O1-OPEN/OpenO1-SFT)** dataset. + +[24/10/09] We supported downloading pre-trained models and datasets from the **[Modelers Hub](https://modelers.cn/models)**. See [this tutorial](#download-from-modelers-hub) for usage. + +[24/09/19] We supported fine-tuning the **[Qwen2.5](https://qwenlm.github.io/blog/qwen2.5/)** models. + +[24/08/30] We supported fine-tuning the **[Qwen2-VL](https://qwenlm.github.io/blog/qwen2-vl/)** models. Thank [@simonJJJ](https://github.com/simonJJJ)'s PR. + +[24/08/27] We supported **[Liger Kernel](https://github.com/linkedin/Liger-Kernel)**. Try `enable_liger_kernel: true` for efficient training. + +[24/08/09] We supported **[Adam-mini](https://github.com/zyushun/Adam-mini)** optimizer. See [examples](examples/README.md) for usage. Thank [@relic-yuexi](https://github.com/relic-yuexi)'s PR. + +[24/07/04] We supported [contamination-free packed training](https://github.com/MeetKai/functionary/tree/main/functionary/train/packing). Use `neat_packing: true` to activate it. Thank [@chuan298](https://github.com/chuan298)'s PR. + +[24/06/16] We supported **[PiSSA](https://arxiv.org/abs/2404.02948)** algorithm. See [examples](examples/README.md) for usage. + +[24/06/07] We supported fine-tuning the **[Qwen2](https://qwenlm.github.io/blog/qwen2/)** and **[GLM-4](https://github.com/THUDM/GLM-4)** models. + +[24/05/26] We supported **[SimPO](https://arxiv.org/abs/2405.14734)** algorithm for preference learning. See [examples](examples/README.md) for usage. + +[24/05/20] We supported fine-tuning the **PaliGemma** series models. Note that the PaliGemma models are pre-trained models, you need to fine-tune them with `paligemma` template for chat completion. 
+ +[24/05/18] We supported **[KTO](https://arxiv.org/abs/2402.01306)** algorithm for preference learning. See [examples](examples/README.md) for usage. + +[24/05/14] We supported training and inference on the Ascend NPU devices. Check [installation](#installation) section for details. + +[24/04/26] We supported fine-tuning the **LLaVA-1.5** multimodal LLMs. See [examples](examples/README.md) for usage. + +[24/04/22] We provided a **[Colab notebook](https://colab.research.google.com/drive/1eRTPn37ltBbYsISy9Aw2NuI2Aq5CQrD9?usp=sharing)** for fine-tuning the Llama-3 model on a free T4 GPU. Two Llama-3-derived models fine-tuned using LLaMA Factory are available at Hugging Face, check [Llama3-8B-Chinese-Chat](https://huggingface.co/shenzhi-wang/Llama3-8B-Chinese-Chat) and [Llama3-Chinese](https://huggingface.co/zhichen/Llama3-Chinese) for details. + +[24/04/21] We supported **[Mixture-of-Depths](https://arxiv.org/abs/2404.02258)** according to [AstraMindAI's implementation](https://github.com/astramind-ai/Mixture-of-depths). See [examples](examples/README.md) for usage. + +[24/04/16] We supported **[BAdam](https://arxiv.org/abs/2404.02827)** optimizer. See [examples](examples/README.md) for usage. + +[24/04/16] We supported **[unsloth](https://github.com/unslothai/unsloth)**'s long-sequence training (Llama-2-7B-56k within 24GB). It achieves **117%** speed and **50%** memory compared with FlashAttention-2, more benchmarks can be found in [this page](https://github.com/hiyouga/LLaMA-Factory/wiki/Performance-comparison). + +[24/03/31] We supported **[ORPO](https://arxiv.org/abs/2403.07691)**. See [examples](examples/README.md) for usage. + +[24/03/21] Our paper "[LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models](https://arxiv.org/abs/2403.13372)" is available at arXiv! + +[24/03/20] We supported **FSDP+QLoRA** that fine-tunes a 70B model on 2x24GB GPUs. See [examples](examples/README.md) for usage. 
+ +[24/03/13] We supported **[LoRA+](https://arxiv.org/abs/2402.12354)**. See [examples](examples/README.md) for usage. + +[24/03/07] We supported **[GaLore](https://arxiv.org/abs/2403.03507)** optimizer. See [examples](examples/README.md) for usage. + +[24/03/07] We integrated **[vLLM](https://github.com/vllm-project/vllm)** for faster and concurrent inference. Try `infer_backend: vllm` to enjoy **270%** inference speed. + +[24/02/28] We supported weight-decomposed LoRA (**[DoRA](https://arxiv.org/abs/2402.09353)**). Try `use_dora: true` to activate DoRA training. + +[24/02/15] We supported **block expansion** proposed by [LLaMA Pro](https://github.com/TencentARC/LLaMA-Pro). See [examples](examples/README.md) for usage. + +[24/02/05] Qwen1.5 (Qwen2 beta version) series models are supported in LLaMA-Factory. Check this [blog post](https://qwenlm.github.io/blog/qwen1.5/) for details. + +[24/01/18] We supported **agent tuning** for most models, equipping model with tool using abilities by fine-tuning with `dataset: glaive_toolcall_en`. + +[23/12/23] We supported **[unsloth](https://github.com/unslothai/unsloth)**'s implementation to boost LoRA tuning for the LLaMA, Mistral and Yi models. Try `use_unsloth: true` argument to activate unsloth patch. It achieves **170%** speed in our benchmark, check [this page](https://github.com/hiyouga/LLaMA-Factory/wiki/Performance-comparison) for details. + +[23/12/12] We supported fine-tuning the latest MoE model **[Mixtral 8x7B](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1)** in our framework. See hardware requirement [here](#hardware-requirement). + +[23/12/01] We supported downloading pre-trained models and datasets from the **[ModelScope Hub](https://modelscope.cn/models)**. See [this tutorial](#download-from-modelscope-hub) for usage. + +[23/10/21] We supported **[NEFTune](https://arxiv.org/abs/2310.05914)** trick for fine-tuning. Try `neftune_noise_alpha: 5` argument to activate NEFTune. 
+ +[23/09/27] We supported **$S^2$-Attn** proposed by [LongLoRA](https://github.com/dvlab-research/LongLoRA) for the LLaMA models. Try `shift_attn: true` argument to enable shift short attention. + +[23/09/23] We integrated MMLU, C-Eval and CMMLU benchmarks in this repo. See [examples](examples/README.md) for usage. + +[23/09/10] We supported **[FlashAttention-2](https://github.com/Dao-AILab/flash-attention)**. Try `flash_attn: fa2` argument to enable FlashAttention-2 if you are using RTX4090, A100 or H100 GPUs. + +[23/08/12] We supported **RoPE scaling** to extend the context length of the LLaMA models. Try `rope_scaling: linear` argument in training and `rope_scaling: dynamic` argument at inference to extrapolate the position embeddings. + +[23/08/11] We supported **[DPO training](https://arxiv.org/abs/2305.18290)** for instruction-tuned models. See [examples](examples/README.md) for usage. + +[23/07/31] We supported **dataset streaming**. Try `streaming: true` and `max_steps: 10000` arguments to load your dataset in streaming mode. + +[23/07/29] We released two instruction-tuned 13B models at Hugging Face. See these Hugging Face Repos ([LLaMA-2](https://huggingface.co/hiyouga/Llama-2-Chinese-13b-chat) / [Baichuan](https://huggingface.co/hiyouga/Baichuan-13B-sft)) for details. + +[23/07/18] We developed an **all-in-one Web UI** for training, evaluation and inference. Try `train_web.py` to fine-tune models in your Web browser. Thank [@KanadeSiina](https://github.com/KanadeSiina) and [@codemayq](https://github.com/codemayq) for their efforts in the development. + +[23/07/09] We released **[FastEdit](https://github.com/hiyouga/FastEdit)** ⚡🩹, an easy-to-use package for editing the factual knowledge of large language models efficiently. Please follow [FastEdit](https://github.com/hiyouga/FastEdit) if you are interested. 
+ +[23/06/29] We provided a **reproducible example** of training a chat model using instruction-following datasets, see [Baichuan-7B-sft](https://huggingface.co/hiyouga/Baichuan-7B-sft) for details. + +[23/06/22] We aligned the [demo API](src/api_demo.py) with the [OpenAI's](https://platform.openai.com/docs/api-reference/chat) format where you can insert the fine-tuned model in **arbitrary ChatGPT-based applications**. + +[23/06/03] We supported quantized training and inference (aka **[QLoRA](https://github.com/artidoro/qlora)**). See [examples](examples/README.md) for usage. + +
+ +> [!TIP] +> If you cannot use the latest feature, please pull the latest code and install LLaMA-Factory again. + +## Supported Models + +| Model | Model size | Template | +| ----------------------------------------------------------------- | -------------------------------- | ------------------- | +| [Baichuan 2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 | +| [BLOOM/BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | +| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 | +| [Command R](https://huggingface.co/CohereForAI) | 35B/104B | cohere | +| [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek | +| [DeepSeek 2.5/3](https://huggingface.co/deepseek-ai) | 236B/671B | deepseek3 | +| [DeepSeek R1 (Distill)](https://huggingface.co/deepseek-ai) | 1.5B/7B/8B/14B/32B/70B/671B | deepseekr1 | +| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon | +| [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma | +| [Gemma 3](https://huggingface.co/google) | 1B/4B/12B/27B | gemma3/gemma (1B) | +| [GLM-4/GLM-4-0414/GLM-Z1](https://huggingface.co/THUDM) | 9B/32B | glm4/glmz1 | +| [GPT-2](https://huggingface.co/openai-community) | 0.1B/0.4B/0.8B/1.5B | - | +| [Granite 3.0-3.3](https://huggingface.co/ibm-granite) | 1B/2B/3B/8B | granite3 | +| [Hunyuan](https://huggingface.co/tencent/) | 7B | hunyuan | +| [Index](https://huggingface.co/IndexTeam) | 1.9B | index | +| [InternLM 2-3](https://huggingface.co/internlm) | 7B/8B/20B | intern2 | +| [InternVL 2.5-3](https://huggingface.co/OpenGVLab) | 1B/2B/8B/14B/38B/78B | intern_vl | +| [Kimi-VL](https://huggingface.co/moonshotai) | 16B | kimi_vl | +| [Llama](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - | +| [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 | +| [Llama 3-3.3](https://huggingface.co/meta-llama) | 1B/3B/8B/70B | llama3 | +| [Llama 
4](https://huggingface.co/meta-llama) | 109B/402B | llama4 | +| [Llama 3.2 Vision](https://huggingface.co/meta-llama) | 11B/90B | mllama | +| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | llava | +| [LLaVA-NeXT](https://huggingface.co/llava-hf) | 7B/8B/13B/34B/72B/110B | llava_next | +| [LLaVA-NeXT-Video](https://huggingface.co/llava-hf) | 7B/34B | llava_next_video | +| [MiMo](https://huggingface.co/XiaomiMiMo) | 7B | mimo | +| [MiniCPM](https://huggingface.co/openbmb) | 0.5B/1B/2B/4B/8B | cpm/cpm3/cpm4 | +| [MiniCPM-o-2.6/MiniCPM-V-2.6](https://huggingface.co/openbmb) | 8B | minicpm_o/minicpm_v | +| [Ministral/Mistral-Nemo](https://huggingface.co/mistralai) | 8B/12B | ministral | +| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral | +| [Mistral Small](https://huggingface.co/mistralai) | 24B | mistral_small | +| [OLMo](https://huggingface.co/allenai) | 1B/7B | - | +| [PaliGemma/PaliGemma2](https://huggingface.co/google) | 3B/10B/28B | paligemma | +| [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | +| [Phi-3/Phi-3.5](https://huggingface.co/microsoft) | 4B/14B | phi | +| [Phi-3-small](https://huggingface.co/microsoft) | 7B | phi_small | +| [Phi-4](https://huggingface.co/microsoft) | 14B | phi4 | +| [Pixtral](https://huggingface.co/mistralai) | 12B | pixtral | +| [Qwen (1-2.5) (Code/Math/MoE/QwQ)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen | +| [Qwen3 (MoE)](https://huggingface.co/Qwen) | 0.6B/1.7B/4B/8B/14B/32B/235B | qwen3 | +| [Qwen2-Audio](https://huggingface.co/Qwen) | 7B | qwen2_audio | +| [Qwen2.5-Omni](https://huggingface.co/Qwen) | 3B/7B | qwen2_omni | +| [Qwen2-VL/Qwen2.5-VL/QVQ](https://huggingface.co/Qwen) | 2B/3B/7B/32B/72B | qwen2_vl | +| [Seed Coder](https://huggingface.co/ByteDance-Seed) | 8B | seed_coder | +| [Skywork o1](https://huggingface.co/Skywork) | 8B | skywork_o1 | +| [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - | +| 
[TeleChat2](https://huggingface.co/Tele-AI) | 3B/7B/35B/115B | telechat2 | +| [XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse | +| [Yi/Yi-1.5 (Code)](https://huggingface.co/01-ai) | 1.5B/6B/9B/34B | yi | +| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl | +| [Yuan 2](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan | + +> [!NOTE] +> For the "base" models, the `template` argument can be chosen from `default`, `alpaca`, `vicuna` etc. But make sure to use the **corresponding template** for the "instruct/chat" models. +> +> Remember to use the **SAME** template in training and inference. +> +> \*: You should install the `transformers` from main branch and use `DISABLE_VERSION_CHECK=1` to skip version check. +> +> \*\*: You need to install a specific version of `transformers` to use the corresponding model. + +Please refer to [constants.py](src/llamafactory/extras/constants.py) for a full list of models we supported. + +You also can add a custom chat template to [template.py](src/llamafactory/data/template.py). 
+ +## Supported Training Approaches + +| Approach | Full-tuning | Freeze-tuning | LoRA | QLoRA | +| ---------------------- | ------------------ | ------------------ | ------------------ | ------------------ | +| Pre-Training | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | +| Supervised Fine-Tuning | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | +| Reward Modeling | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | +| PPO Training | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | +| DPO Training | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | +| KTO Training | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | +| ORPO Training | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | +| SimPO Training | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | + +> [!TIP] +> The implementation details of PPO can be found in [this blog](https://newfacade.github.io/notes-on-reinforcement-learning/17-ppo-trl.html). + +## Provided Datasets + +
Pre-training datasets + +- [Wiki Demo (en)](data/wiki_demo.txt) +- [RefinedWeb (en)](https://huggingface.co/datasets/tiiuae/falcon-refinedweb) +- [RedPajama V2 (en)](https://huggingface.co/datasets/togethercomputer/RedPajama-Data-V2) +- [Wikipedia (en)](https://huggingface.co/datasets/olm/olm-wikipedia-20221220) +- [Wikipedia (zh)](https://huggingface.co/datasets/pleisto/wikipedia-cn-20230720-filtered) +- [Pile (en)](https://huggingface.co/datasets/EleutherAI/pile) +- [SkyPile (zh)](https://huggingface.co/datasets/Skywork/SkyPile-150B) +- [FineWeb (en)](https://huggingface.co/datasets/HuggingFaceFW/fineweb) +- [FineWeb-Edu (en)](https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu) +- [The Stack (en)](https://huggingface.co/datasets/bigcode/the-stack) +- [StarCoder (en)](https://huggingface.co/datasets/bigcode/starcoderdata) + +
+ +
Supervised fine-tuning datasets + +- [Identity (en&zh)](data/identity.json) +- [Stanford Alpaca (en)](https://github.com/tatsu-lab/stanford_alpaca) +- [Stanford Alpaca (zh)](https://github.com/ymcui/Chinese-LLaMA-Alpaca-3) +- [Alpaca GPT4 (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM) +- [Glaive Function Calling V2 (en&zh)](https://huggingface.co/datasets/glaiveai/glaive-function-calling-v2) +- [LIMA (en)](https://huggingface.co/datasets/GAIR/lima) +- [Guanaco Dataset (multilingual)](https://huggingface.co/datasets/JosephusCheung/GuanacoDataset) +- [BELLE 2M (zh)](https://huggingface.co/datasets/BelleGroup/train_2M_CN) +- [BELLE 1M (zh)](https://huggingface.co/datasets/BelleGroup/train_1M_CN) +- [BELLE 0.5M (zh)](https://huggingface.co/datasets/BelleGroup/train_0.5M_CN) +- [BELLE Dialogue 0.4M (zh)](https://huggingface.co/datasets/BelleGroup/generated_chat_0.4M) +- [BELLE School Math 0.25M (zh)](https://huggingface.co/datasets/BelleGroup/school_math_0.25M) +- [BELLE Multiturn Chat 0.8M (zh)](https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M) +- [UltraChat (en)](https://github.com/thunlp/UltraChat) +- [OpenPlatypus (en)](https://huggingface.co/datasets/garage-bAInd/Open-Platypus) +- [CodeAlpaca 20k (en)](https://huggingface.co/datasets/sahil2801/CodeAlpaca-20k) +- [Alpaca CoT (multilingual)](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT) +- [OpenOrca (en)](https://huggingface.co/datasets/Open-Orca/OpenOrca) +- [SlimOrca (en)](https://huggingface.co/datasets/Open-Orca/SlimOrca) +- [MathInstruct (en)](https://huggingface.co/datasets/TIGER-Lab/MathInstruct) +- [Firefly 1.1M (zh)](https://huggingface.co/datasets/YeungNLP/firefly-train-1.1M) +- [Wiki QA (en)](https://huggingface.co/datasets/wiki_qa) +- [Web QA (zh)](https://huggingface.co/datasets/suolyer/webqa) +- [WebNovel (zh)](https://huggingface.co/datasets/zxbsmk/webnovel_cn) +- [Nectar (en)](https://huggingface.co/datasets/berkeley-nest/Nectar) +- [deepctrl 
(en&zh)](https://www.modelscope.cn/datasets/deepctrl/deepctrl-sft-data) +- [Advertise Generating (zh)](https://huggingface.co/datasets/HasturOfficial/adgen) +- [ShareGPT Hyperfiltered (en)](https://huggingface.co/datasets/totally-not-an-llm/sharegpt-hyperfiltered-3k) +- [ShareGPT4 (en&zh)](https://huggingface.co/datasets/shibing624/sharegpt_gpt4) +- [UltraChat 200k (en)](https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k) +- [AgentInstruct (en)](https://huggingface.co/datasets/THUDM/AgentInstruct) +- [LMSYS Chat 1M (en)](https://huggingface.co/datasets/lmsys/lmsys-chat-1m) +- [Evol Instruct V2 (en)](https://huggingface.co/datasets/WizardLM/WizardLM_evol_instruct_V2_196k) +- [Cosmopedia (en)](https://huggingface.co/datasets/HuggingFaceTB/cosmopedia) +- [STEM (zh)](https://huggingface.co/datasets/hfl/stem_zh_instruction) +- [Ruozhiba (zh)](https://huggingface.co/datasets/hfl/ruozhiba_gpt4_turbo) +- [Neo-sft (zh)](https://huggingface.co/datasets/m-a-p/neo_sft_phase2) +- [Magpie-Pro-300K-Filtered (en)](https://huggingface.co/datasets/Magpie-Align/Magpie-Pro-300K-Filtered) +- [Magpie-ultra-v0.1 (en)](https://huggingface.co/datasets/argilla/magpie-ultra-v0.1) +- [WebInstructSub (en)](https://huggingface.co/datasets/TIGER-Lab/WebInstructSub) +- [OpenO1-SFT (en&zh)](https://huggingface.co/datasets/O1-OPEN/OpenO1-SFT) +- [Open-Thoughts (en)](https://huggingface.co/datasets/open-thoughts/OpenThoughts-114k) +- [Open-R1-Math (en)](https://huggingface.co/datasets/open-r1/OpenR1-Math-220k) +- [Chinese-DeepSeek-R1-Distill (zh)](https://huggingface.co/datasets/Congliu/Chinese-DeepSeek-R1-Distill-data-110k-SFT) +- [LLaVA mixed (en&zh)](https://huggingface.co/datasets/BUAADreamer/llava-en-zh-300k) +- [Pokemon-gpt4o-captions (en&zh)](https://huggingface.co/datasets/jugg1024/pokemon-gpt4o-captions) +- [Open Assistant (de)](https://huggingface.co/datasets/mayflowergmbh/oasst_de) +- [Dolly 15k (de)](https://huggingface.co/datasets/mayflowergmbh/dolly-15k_de) +- [Alpaca GPT4 
(de)](https://huggingface.co/datasets/mayflowergmbh/alpaca-gpt4_de) +- [OpenSchnabeltier (de)](https://huggingface.co/datasets/mayflowergmbh/openschnabeltier_de) +- [Evol Instruct (de)](https://huggingface.co/datasets/mayflowergmbh/evol-instruct_de) +- [Dolphin (de)](https://huggingface.co/datasets/mayflowergmbh/dolphin_de) +- [Booksum (de)](https://huggingface.co/datasets/mayflowergmbh/booksum_de) +- [Airoboros (de)](https://huggingface.co/datasets/mayflowergmbh/airoboros-3.0_de) +- [Ultrachat (de)](https://huggingface.co/datasets/mayflowergmbh/ultra-chat_de) + +
+ +
Preference datasets + +- [DPO mixed (en&zh)](https://huggingface.co/datasets/hiyouga/DPO-En-Zh-20k) +- [UltraFeedback (en)](https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized) +- [COIG-P (zh)](https://huggingface.co/datasets/m-a-p/COIG-P) +- [RLHF-V (en)](https://huggingface.co/datasets/openbmb/RLHF-V-Dataset) +- [VLFeedback (en)](https://huggingface.co/datasets/Zhihui/VLFeedback) +- [RLAIF-V (en)](https://huggingface.co/datasets/openbmb/RLAIF-V-Dataset) +- [Orca DPO Pairs (en)](https://huggingface.co/datasets/Intel/orca_dpo_pairs) +- [HH-RLHF (en)](https://huggingface.co/datasets/Anthropic/hh-rlhf) +- [Nectar (en)](https://huggingface.co/datasets/berkeley-nest/Nectar) +- [Orca DPO (de)](https://huggingface.co/datasets/mayflowergmbh/intel_orca_dpo_pairs_de) +- [KTO mixed (en)](https://huggingface.co/datasets/argilla/kto-mix-15k) + +
+ +Some datasets require confirmation before using them, so we recommend logging in with your Hugging Face account using these commands. + +```bash +pip install --upgrade huggingface_hub +huggingface-cli login +``` + +## Requirement + +| Mandatory | Minimum | Recommend | +| ------------ | ------- | --------- | +| python | 3.9 | 3.10 | +| torch | 2.0.0 | 2.6.0 | +| torchvision | 0.15.0 | 0.21.0 | +| transformers | 4.45.0 | 4.50.0 | +| datasets | 2.16.0 | 3.2.0 | +| accelerate | 0.34.0 | 1.2.1 | +| peft | 0.14.0 | 0.15.1 | +| trl | 0.8.6 | 0.9.6 | + +| Optional | Minimum | Recommend | +| ------------ | ------- | --------- | +| CUDA | 11.6 | 12.2 | +| deepspeed | 0.10.0 | 0.16.4 | +| bitsandbytes | 0.39.0 | 0.43.1 | +| vllm | 0.4.3 | 0.8.2 | +| flash-attn | 2.5.6 | 2.7.2 | + +### Hardware Requirement + +\* *estimated* + +| Method | Bits | 7B | 14B | 30B | 70B | `x`B | +| ------------------------------- | ---- | ----- | ----- | ----- | ------ | ------- | +| Full (`bf16` or `fp16`) | 32 | 120GB | 240GB | 600GB | 1200GB | `18x`GB | +| Full (`pure_bf16`) | 16 | 60GB | 120GB | 300GB | 600GB | `8x`GB | +| Freeze/LoRA/GaLore/APOLLO/BAdam | 16 | 16GB | 32GB | 64GB | 160GB | `2x`GB | +| QLoRA | 8 | 10GB | 20GB | 40GB | 80GB | `x`GB | +| QLoRA | 4 | 6GB | 12GB | 24GB | 48GB | `x/2`GB | +| QLoRA | 2 | 4GB | 8GB | 16GB | 24GB | `x/4`GB | + +## Getting Started + +### Installation + +> [!IMPORTANT] +> Installation is mandatory. 
+ +#### Install from Source + +```bash +git clone --depth 1 https://github.com/hiyouga/LLaMA-Factory.git +cd LLaMA-Factory +pip install -e ".[torch,metrics]" --no-build-isolation +``` + +Extra dependencies available: torch, torch-npu, metrics, deepspeed, liger-kernel, bitsandbytes, hqq, eetq, gptq, aqlm, vllm, sglang, galore, apollo, badam, adam-mini, qwen, minicpm_v, modelscope, openmind, swanlab, dev + +#### Install from Docker Image + +```bash +docker run -it --rm --gpus=all --ipc=host hiyouga/llamafactory:latest +``` + +This image is built on Ubuntu 22.04 (x86\_64), CUDA 12.4, Python 3.11, PyTorch 2.6.0, and Flash-attn 2.7.4. + +Find the pre-built images: https://hub.docker.com/r/hiyouga/llamafactory/tags + +Please refer to [build docker](#build-docker) to build the image yourself. + +
Setting up a virtual environment with uv + +Create an isolated Python environment with [uv](https://github.com/astral-sh/uv): + +```bash +uv sync --extra torch --extra metrics --prerelease=allow +``` + +Run LLaMA-Factory in the isolated environment: + +```bash +uv run --prerelease=allow llamafactory-cli train examples/train_lora/llama3_lora_pretrain.yaml +``` + +
+ +
For Windows users + +#### Install PyTorch + +You need to manually install the GPU version of PyTorch on the Windows platform. Please refer to the [official website](https://pytorch.org/get-started/locally/) and the following command to install PyTorch with CUDA support: + +```bash +pip uninstall torch torchvision torchaudio +pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126 +python -c "import torch; print(torch.cuda.is_available())" +``` + +If you see `True` then you have successfully installed PyTorch with CUDA support. + +Try `dataloader_num_workers: 0` if you encounter `Can't pickle local object` error. + +#### Install BitsAndBytes + +If you want to enable the quantized LoRA (QLoRA) on the Windows platform, you need to install a pre-built version of `bitsandbytes` library, which supports CUDA 11.1 to 12.2, please select the appropriate [release version](https://github.com/jllllll/bitsandbytes-windows-webui/releases/tag/wheels) based on your CUDA version. + +```bash +pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.2.post2-py3-none-win_amd64.whl +``` + +#### Install Flash Attention-2 + +To enable FlashAttention-2 on the Windows platform, please use the script from [flash-attention-windows-wheel](https://huggingface.co/lldacing/flash-attention-windows-wheel) to compile and install it by yourself. + +
+ +
For Ascend NPU users + +To install LLaMA Factory on Ascend NPU devices, please upgrade Python to version 3.10 or higher and specify extra dependencies: `pip install -e ".[torch-npu,metrics]"`. Additionally, you need to install the **[Ascend CANN Toolkit and Kernels](https://www.hiascend.com/developer/download/community/result?module=cann)**. Please follow the [installation tutorial](https://www.hiascend.com/document/detail/en/CANNCommunityEdition/600alphaX/softwareinstall/instg/atlasdeploy_03_0031.html) or use the following commands: + +```bash +# replace the url according to your CANN version and devices +# install CANN Toolkit +wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/Milan-ASL/Milan-ASL%20V100R001C20SPC702/Ascend-cann-toolkit_8.0.0.alpha002_linux-"$(uname -i)".run +bash Ascend-cann-toolkit_8.0.0.alpha002_linux-"$(uname -i)".run --install + +# install CANN Kernels +wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/Milan-ASL/Milan-ASL%20V100R001C20SPC702/Ascend-cann-kernels-910b_8.0.0.alpha002_linux-"$(uname -i)".run +bash Ascend-cann-kernels-910b_8.0.0.alpha002_linux-"$(uname -i)".run --install + +# set env variables +source /usr/local/Ascend/ascend-toolkit/set_env.sh +``` + +| Requirement | Minimum | Recommend | +| ------------ | ------- | -------------- | +| CANN | 8.0.RC1 | 8.0.0.alpha002 | +| torch | 2.1.0 | 2.4.0 | +| torch-npu | 2.1.0 | 2.4.0.post2 | +| deepspeed | 0.13.2 | 0.13.2 | +| vllm-ascend | - | 0.7.3 | + +Remember to use `ASCEND_RT_VISIBLE_DEVICES` instead of `CUDA_VISIBLE_DEVICES` to specify the device to use. + +If you cannot infer model on NPU devices, try setting `do_sample: false` in the configurations. + +Download the pre-built Docker images: [32GB](http://mirrors.cn-central-221.ovaijisuan.com/detail/130.html) | [64GB](http://mirrors.cn-central-221.ovaijisuan.com/detail/131.html) + +#### Install BitsAndBytes + +To use QLoRA based on bitsandbytes on Ascend NPU, please follow these 3 steps: + +1. 
Manually compile bitsandbytes: Refer to [the installation documentation](https://huggingface.co/docs/bitsandbytes/installation?backend=Ascend+NPU&platform=Ascend+NPU) for the NPU version of bitsandbytes to complete the compilation and installation. The compilation requires a cmake version of at least 3.22.1 and a g++ version of at least 12.x. + +```bash +# Install bitsandbytes from source +# Clone bitsandbytes repo, Ascend NPU backend is currently enabled on multi-backend-refactor branch +git clone -b multi-backend-refactor https://github.com/bitsandbytes-foundation/bitsandbytes.git +cd bitsandbytes/ + +# Install dependencies +pip install -r requirements-dev.txt + +# Install the dependencies for the compilation tools. Note that the commands for this step may vary depending on the operating system. The following are provided for reference +apt-get install -y build-essential cmake + +# Compile & install +cmake -DCOMPUTE_BACKEND=npu -S . +make +pip install . +``` + +2. Install transformers from the main branch. + +```bash +git clone -b main https://github.com/huggingface/transformers.git +cd transformers +pip install . +``` + +3. Set `double_quantization: false` in the configuration. You can refer to the [example](examples/train_qlora/llama3_lora_sft_bnb_npu.yaml). + +
+ +### Data Preparation + +Please refer to [data/README.md](data/README.md) for checking the details about the format of dataset files. You can use datasets on HuggingFace / ModelScope / Modelers hub, load the dataset in local disk, or specify a path to s3/gcs cloud storage. + +> [!NOTE] +> Please update `data/dataset_info.json` to use your custom dataset. + +You can also use **[Easy Dataset](https://github.com/ConardLi/easy-dataset)** or **[GraphGen](https://github.com/open-sciencelab/GraphGen)** to create synthetic data for fine-tuning. + +### Quickstart + +Use the following 3 commands to run LoRA **fine-tuning**, **inference** and **merging** of the Llama3-8B-Instruct model, respectively. + +```bash +llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml +llamafactory-cli chat examples/inference/llama3_lora_sft.yaml +llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml +``` + +See [examples/README.md](examples/README.md) for advanced usage (including distributed training). + +> [!TIP] +> Use `llamafactory-cli help` to show help information. +> +> Read [FAQs](https://github.com/hiyouga/LLaMA-Factory/issues/4614) first if you encounter any problems. + +### Fine-Tuning with LLaMA Board GUI (powered by [Gradio](https://github.com/gradio-app/gradio)) + +```bash +llamafactory-cli webui +``` + +### Build Docker + +For CUDA users: + +```bash +cd docker/docker-cuda/ +docker compose up -d +docker compose exec llamafactory bash +``` + +For Ascend NPU users: + +```bash +cd docker/docker-npu/ +docker compose up -d +docker compose exec llamafactory bash +``` + +For AMD ROCm users: + +```bash +cd docker/docker-rocm/ +docker compose up -d +docker compose exec llamafactory bash +``` + +
Build without Docker Compose + +For CUDA users: + +```bash +docker build -f ./docker/docker-cuda/Dockerfile \ + --build-arg PIP_INDEX=https://pypi.org/simple \ + --build-arg EXTRAS=metrics \ + -t llamafactory:latest . + +docker run -dit --ipc=host --gpus=all \ + -p 7860:7860 \ + -p 8000:8000 \ + --name llamafactory \ + llamafactory:latest + +docker exec -it llamafactory bash +``` + +For Ascend NPU users: + +```bash +docker build -f ./docker/docker-npu/Dockerfile \ + --build-arg PIP_INDEX=https://pypi.org/simple \ + --build-arg EXTRAS=torch-npu,metrics \ + -t llamafactory:latest . + +docker run -dit --ipc=host \ + -v /usr/local/dcmi:/usr/local/dcmi \ + -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ + -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \ + -v /etc/ascend_install.info:/etc/ascend_install.info \ + -p 7860:7860 \ + -p 8000:8000 \ + --device /dev/davinci0 \ + --device /dev/davinci_manager \ + --device /dev/devmm_svm \ + --device /dev/hisi_hdc \ + --name llamafactory \ + llamafactory:latest + +docker exec -it llamafactory bash +``` + +For AMD ROCm users: + +```bash +docker build -f ./docker/docker-rocm/Dockerfile \ + --build-arg PIP_INDEX=https://pypi.org/simple \ + --build-arg EXTRAS=metrics \ + -t llamafactory:latest . + +docker run -dit --ipc=host \ + -p 7860:7860 \ + -p 8000:8000 \ + --device /dev/kfd \ + --device /dev/dri \ + --name llamafactory \ + llamafactory:latest + +docker exec -it llamafactory bash +``` + +
+ +
Use Docker volumes + +You can uncomment `VOLUME [ "/root/.cache/huggingface", "/app/shared_data", "/app/output" ]` in the Dockerfile to use data volumes. + +When building the Docker image, use the `-v ./hf_cache:/root/.cache/huggingface` argument to mount the local directory to the container. The following data volumes are available. + +- `hf_cache`: Utilize Hugging Face cache on the host machine. +- `shared_data`: The directory to store datasets on the host machine. +- `output`: Set export dir to this location so that the merged result can be accessed directly on the host machine. + +
+ +### Deploy with OpenAI-style API and vLLM + +```bash +API_PORT=8000 llamafactory-cli api examples/inference/llama3.yaml infer_backend=vllm vllm_enforce_eager=true +``` + +> [!TIP] +> Visit [this page](https://platform.openai.com/docs/api-reference/chat/create) for API document. +> +> Examples: [Image understanding](scripts/api_example/test_image.py) | [Function calling](scripts/api_example/test_toolcall.py) + +### Download from ModelScope Hub + +If you have trouble with downloading models and datasets from Hugging Face, you can use ModelScope. + +```bash +export USE_MODELSCOPE_HUB=1 # `set USE_MODELSCOPE_HUB=1` for Windows +``` + +Train the model by specifying a model ID of the ModelScope Hub as the `model_name_or_path`. You can find a full list of model IDs at [ModelScope Hub](https://modelscope.cn/models), e.g., `LLM-Research/Meta-Llama-3-8B-Instruct`. + +### Download from Modelers Hub + +You can also use Modelers Hub to download models and datasets. + +```bash +export USE_OPENMIND_HUB=1 # `set USE_OPENMIND_HUB=1` for Windows +``` + +Train the model by specifying a model ID of the Modelers Hub as the `model_name_or_path`. You can find a full list of model IDs at [Modelers Hub](https://modelers.cn/models), e.g., `TeleAI/TeleChat-7B-pt`. + +### Use W&B Logger + +To use [Weights & Biases](https://wandb.ai) for logging experimental results, you need to add the following arguments to yaml files. + +```yaml +report_to: wandb +run_name: test_run # optional +``` + +Set `WANDB_API_KEY` to [your key](https://wandb.ai/authorize) when launching training tasks to log in with your W&B account. + +### Use SwanLab Logger + +To use [SwanLab](https://github.com/SwanHubX/SwanLab) for logging experimental results, you need to add the following arguments to yaml files. + +```yaml +use_swanlab: true +swanlab_run_name: test_run # optional +``` + +When launching training tasks, you can log in to SwanLab in three ways: + +1. 
Add `swanlab_api_key=` to the yaml file, and set it to your [API key](https://swanlab.cn/settings). +2. Set the environment variable `SWANLAB_API_KEY` to your [API key](https://swanlab.cn/settings). +3. Use the `swanlab login` command to complete the login. + +## Projects using LLaMA Factory + +If you have a project that should be incorporated, please contact via email or create a pull request. + +
Click to show + +1. Wang et al. ESRL: Efficient Sampling-based Reinforcement Learning for Sequence Generation. 2023. [[arxiv]](https://arxiv.org/abs/2308.02223) +1. Yu et al. Open, Closed, or Small Language Models for Text Classification? 2023. [[arxiv]](https://arxiv.org/abs/2308.10092) +1. Wang et al. UbiPhysio: Support Daily Functioning, Fitness, and Rehabilitation with Action Understanding and Feedback in Natural Language. 2023. [[arxiv]](https://arxiv.org/abs/2308.10526) +1. Luceri et al. Leveraging Large Language Models to Detect Influence Campaigns in Social Media. 2023. [[arxiv]](https://arxiv.org/abs/2311.07816) +1. Zhang et al. Alleviating Hallucinations of Large Language Models through Induced Hallucinations. 2023. [[arxiv]](https://arxiv.org/abs/2312.15710) +1. Wang et al. Know Your Needs Better: Towards Structured Understanding of Marketer Demands with Analogical Reasoning Augmented LLMs. KDD 2024. [[arxiv]](https://arxiv.org/abs/2401.04319) +1. Wang et al. CANDLE: Iterative Conceptualization and Instantiation Distillation from Large Language Models for Commonsense Reasoning. ACL 2024. [[arxiv]](https://arxiv.org/abs/2401.07286) +1. Choi et al. FACT-GPT: Fact-Checking Augmentation via Claim Matching with LLMs. 2024. [[arxiv]](https://arxiv.org/abs/2402.05904) +1. Zhang et al. AutoMathText: Autonomous Data Selection with Language Models for Mathematical Texts. 2024. [[arxiv]](https://arxiv.org/abs/2402.07625) +1. Lyu et al. KnowTuning: Knowledge-aware Fine-tuning for Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2402.11176) +1. Yang et al. LaCo: Large Language Model Pruning via Layer Collapse. 2024. [[arxiv]](https://arxiv.org/abs/2402.11187) +1. Bhardwaj et al. Language Models are Homer Simpson! Safety Re-Alignment of Fine-tuned Language Models through Task Arithmetic. 2024. [[arxiv]](https://arxiv.org/abs/2402.11746) +1. Yang et al. Enhancing Empathetic Response Generation by Augmenting LLMs with Small-scale Empathetic Models. 2024. 
[[arxiv]](https://arxiv.org/abs/2402.11801) +1. Yi et al. Generation Meets Verification: Accelerating Large Language Model Inference with Smart Parallel Auto-Correct Decoding. ACL 2024 Findings. [[arxiv]](https://arxiv.org/abs/2402.11809) +1. Cao et al. Head-wise Shareable Attention for Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2402.11819) +1. Zhang et al. Enhancing Multilingual Capabilities of Large Language Models through Self-Distillation from Resource-Rich Languages. 2024. [[arxiv]](https://arxiv.org/abs/2402.12204) +1. Kim et al. Efficient and Effective Vocabulary Expansion Towards Multilingual Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2402.14714) +1. Yu et al. KIEval: A Knowledge-grounded Interactive Evaluation Framework for Large Language Models. ACL 2024. [[arxiv]](https://arxiv.org/abs/2402.15043) +1. Huang et al. Key-Point-Driven Data Synthesis with its Enhancement on Mathematical Reasoning. 2024. [[arxiv]](https://arxiv.org/abs/2403.02333) +1. Duan et al. Negating Negatives: Alignment without Human Positive Samples via Distributional Dispreference Optimization. 2024. [[arxiv]](https://arxiv.org/abs/2403.03419) +1. Xie and Schwertfeger. Empowering Robotics with Large Language Models: osmAG Map Comprehension with LLMs. 2024. [[arxiv]](https://arxiv.org/abs/2403.08228) +1. Wu et al. Large Language Models are Parallel Multilingual Learners. 2024. [[arxiv]](https://arxiv.org/abs/2403.09073) +1. Zhang et al. EDT: Improving Large Language Models' Generation by Entropy-based Dynamic Temperature Sampling. 2024. [[arxiv]](https://arxiv.org/abs/2403.14541) +1. Weller et al. FollowIR: Evaluating and Teaching Information Retrieval Models to Follow Instructions. 2024. [[arxiv]](https://arxiv.org/abs/2403.15246) +1. Hongbin Na. CBT-LLM: A Chinese Large Language Model for Cognitive Behavioral Therapy-based Mental Health Question Answering. COLING 2024. [[arxiv]](https://arxiv.org/abs/2403.16008) +1. Zan et al. 
CodeS: Natural Language to Code Repository via Multi-Layer Sketch. 2024. [[arxiv]](https://arxiv.org/abs/2403.16443) +1. Liu et al. Extensive Self-Contrast Enables Feedback-Free Language Model Alignment. 2024. [[arxiv]](https://arxiv.org/abs/2404.00604) +1. Luo et al. BAdam: A Memory Efficient Full Parameter Training Method for Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2404.02827) +1. Du et al. Chinese Tiny LLM: Pretraining a Chinese-Centric Large Language Model. 2024. [[arxiv]](https://arxiv.org/abs/2404.04167) +1. Ma et al. Parameter Efficient Quasi-Orthogonal Fine-Tuning via Givens Rotation. ICML 2024. [[arxiv]](https://arxiv.org/abs/2404.04316) +1. Liu et al. Dynamic Generation of Personalities with Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2404.07084) +1. Shang et al. How Far Have We Gone in Stripped Binary Code Understanding Using Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2404.09836) +1. Huang et al. LLMTune: Accelerate Database Knob Tuning with Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2404.11581) +1. Deng et al. Text-Tuple-Table: Towards Information Integration in Text-to-Table Generation via Global Tuple Extraction. 2024. [[arxiv]](https://arxiv.org/abs/2404.14215) +1. Acikgoz et al. Hippocrates: An Open-Source Framework for Advancing Large Language Models in Healthcare. 2024. [[arxiv]](https://arxiv.org/abs/2404.16621) +1. Zhang et al. Small Language Models Need Strong Verifiers to Self-Correct Reasoning. ACL 2024 Findings. [[arxiv]](https://arxiv.org/abs/2404.17140) +1. Zhou et al. FREB-TQA: A Fine-Grained Robustness Evaluation Benchmark for Table Question Answering. NAACL 2024. [[arxiv]](https://arxiv.org/abs/2404.18585) +1. Xu et al. Large Language Models for Cyber Security: A Systematic Literature Review. 2024. [[arxiv]](https://arxiv.org/abs/2405.04760) +1. Dammu et al. "They are uncultured": Unveiling Covert Harms and Social Threats in LLM Generated Conversations. 2024. 
[[arxiv]](https://arxiv.org/abs/2405.05378) +1. Yi et al. A safety realignment framework via subspace-oriented model fusion for large language models. 2024. [[arxiv]](https://arxiv.org/abs/2405.09055) +1. Lou et al. SPO: Multi-Dimensional Preference Sequential Alignment With Implicit Reward Modeling. 2024. [[arxiv]](https://arxiv.org/abs/2405.12739) +1. Zhang et al. Getting More from Less: Large Language Models are Good Spontaneous Multilingual Learners. 2024. [[arxiv]](https://arxiv.org/abs/2405.13816) +1. Zhang et al. TS-Align: A Teacher-Student Collaborative Framework for Scalable Iterative Finetuning of Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2405.20215) +1. Zihong Chen. Sentence Segmentation and Sentence Punctuation Based on XunziALLM. 2024. [[paper]](https://aclanthology.org/2024.lt4hala-1.30) +1. Gao et al. The Best of Both Worlds: Toward an Honest and Helpful Large Language Model. 2024. [[arxiv]](https://arxiv.org/abs/2406.00380) +1. Wang and Song. MARS: Benchmarking the Metaphysical Reasoning Abilities of Language Models with a Multi-task Evaluation Dataset. 2024. [[arxiv]](https://arxiv.org/abs/2406.02106) +1. Hu et al. Computational Limits of Low-Rank Adaptation (LoRA) for Transformer-Based Models. 2024. [[arxiv]](https://arxiv.org/abs/2406.03136) +1. Ge et al. Time Sensitive Knowledge Editing through Efficient Finetuning. ACL 2024. [[arxiv]](https://arxiv.org/abs/2406.04496) +1. Tan et al. Peer Review as A Multi-Turn and Long-Context Dialogue with Role-Based Interactions. 2024. [[arxiv]](https://arxiv.org/abs/2406.05688) +1. Song et al. Turbo Sparse: Achieving LLM SOTA Performance with Minimal Activated Parameters. 2024. [[arxiv]](https://arxiv.org/abs/2406.05955) +1. Gu et al. RWKV-CLIP: A Robust Vision-Language Representation Learner. 2024. [[arxiv]](https://arxiv.org/abs/2406.06973) +1. Chen et al. Advancing Tool-Augmented Large Language Models: Integrating Insights from Errors in Inference Trees. 2024. 
[[arxiv]](https://arxiv.org/abs/2406.07115) +1. Zhu et al. Are Large Language Models Good Statisticians?. 2024. [[arxiv]](https://arxiv.org/abs/2406.07815) +1. Li et al. Know the Unknown: An Uncertainty-Sensitive Method for LLM Instruction Tuning. 2024. [[arxiv]](https://arxiv.org/abs/2406.10099) +1. Ding et al. IntentionQA: A Benchmark for Evaluating Purchase Intention Comprehension Abilities of Language Models in E-commerce. 2024. [[arxiv]](https://arxiv.org/abs/2406.10173) +1. He et al. COMMUNITY-CROSS-INSTRUCT: Unsupervised Instruction Generation for Aligning Large Language Models to Online Communities. 2024. [[arxiv]](https://arxiv.org/abs/2406.12074) +1. Lin et al. FVEL: Interactive Formal Verification Environment with Large Language Models via Theorem Proving. 2024. [[arxiv]](https://arxiv.org/abs/2406.14408) +1. Treutlein et al. Connecting the Dots: LLMs can Infer and Verbalize Latent Structure from Disparate Training Data. 2024. [[arxiv]](https://arxiv.org/abs/2406.14546) +1. Feng et al. SS-Bench: A Benchmark for Social Story Generation and Evaluation. 2024. [[arxiv]](https://arxiv.org/abs/2406.15695) +1. Feng et al. Self-Constructed Context Decompilation with Fined-grained Alignment Enhancement. 2024. [[arxiv]](https://arxiv.org/abs/2406.17233) +1. Liu et al. Large Language Models for Cuffless Blood Pressure Measurement From Wearable Biosignals. 2024. [[arxiv]](https://arxiv.org/abs/2406.18069) +1. Iyer et al. Exploring Very Low-Resource Translation with LLMs: The University of Edinburgh's Submission to AmericasNLP 2024 Translation Task. AmericasNLP 2024. [[paper]](https://aclanthology.org/2024.americasnlp-1.25) +1. Li et al. Calibrating LLMs with Preference Optimization on Thought Trees for Generating Rationale in Science Question Scoring. 2024. [[arxiv]](https://arxiv.org/abs/2406.19949) +1. Yang et al. Financial Knowledge Large Language Model. 2024. [[arxiv]](https://arxiv.org/abs/2407.00365) +1. Lin et al. 
DogeRM: Equipping Reward Models with Domain Knowledge through Model Merging. 2024. [[arxiv]](https://arxiv.org/abs/2407.01470) +1. Bako et al. Evaluating the Semantic Profiling Abilities of LLMs for Natural Language Utterances in Data Visualization. 2024. [[arxiv]](https://arxiv.org/abs/2407.06129) +1. Huang et al. RoLoRA: Fine-tuning Rotated Outlier-free LLMs for Effective Weight-Activation Quantization. 2024. [[arxiv]](https://arxiv.org/abs/2407.08044) +1. Jiang et al. LLM-Collaboration on Automatic Science Journalism for the General Audience. 2024. [[arxiv]](https://arxiv.org/abs/2407.09756) +1. Inouye et al. Applied Auto-tuning on LoRA Hyperparameters. 2024. [[paper]](https://scholarcommons.scu.edu/cseng_senior/272/) +1. Qi et al. Research on Tibetan Tourism Viewpoints information generation system based on LLM. 2024. [[arxiv]](https://arxiv.org/abs/2407.13561) +1. Xu et al. Course-Correction: Safety Alignment Using Synthetic Preferences. 2024. [[arxiv]](https://arxiv.org/abs/2407.16637) +1. Sun et al. LAMBDA: A Large Model Based Data Agent. 2024. [[arxiv]](https://arxiv.org/abs/2407.17535) +1. Zhu et al. CollectiveSFT: Scaling Large Language Models for Chinese Medical Benchmark with Collective Instructions in Healthcare. 2024. [[arxiv]](https://arxiv.org/abs/2407.19705) +1. Yu et al. Correcting Negative Bias in Large Language Models through Negative Attention Score Alignment. 2024. [[arxiv]](https://arxiv.org/abs/2408.00137) +1. Xie et al. The Power of Personalized Datasets: Advancing Chinese Composition Writing for Elementary School through Targeted Model Fine-Tuning. IALP 2024. [[paper]](https://www.asianlp.sg/conferences/ialp2024/proceedings/papers/IALP2024_P055.pdf) +1. Liu et al. Instruct-Code-Llama: Improving Capabilities of Language Model in Competition Level Code Generation by Online Judge Feedback. ICIC 2024. [[paper]](https://link.springer.com/chapter/10.1007/978-981-97-5669-8_11) +1. Wang et al. 
Cybernetic Sentinels: Unveiling the Impact of Safety Data Selection on Model Security in Supervised Fine-Tuning. ICIC 2024. [[paper]](https://link.springer.com/chapter/10.1007/978-981-97-5669-8_23) +1. Xia et al. Understanding the Performance and Estimating the Cost of LLM Fine-Tuning. 2024. [[arxiv]](https://arxiv.org/abs/2408.04693) +1. Zeng et al. Perceive, Reflect, and Plan: Designing LLM Agent for Goal-Directed City Navigation without Instructions. 2024. [[arxiv]](https://arxiv.org/abs/2408.04168) +1. Xia et al. Using Pre-trained Language Model for Accurate ESG Prediction. FinNLP 2024. [[paper]](https://aclanthology.org/2024.finnlp-2.1/) +1. Liang et al. I-SHEEP: Self-Alignment of LLM from Scratch through an Iterative Self-Enhancement Paradigm. 2024. [[arxiv]](https://arxiv.org/abs/2408.08072) +1. Bai et al. Aligning Large Language Model with Direct Multi-Preference Optimization for Recommendation. CIKM 2024. [[paper]](https://dl.acm.org/doi/10.1145/3627673.3679611) +1. Zhang et al. CPsyCoun: A Report-based Multi-turn Dialogue Reconstruction and Evaluation Framework for Chinese Psychological Counseling. ACL 2024. [[paper]](https://aclanthology.org/2024.findings-acl.830.pdf) +1. **[StarWhisper](https://github.com/Yu-Yang-Li/StarWhisper)**: A large language model for Astronomy, based on ChatGLM2-6B and Qwen-14B. +1. **[DISC-LawLLM](https://github.com/FudanDISC/DISC-LawLLM)**: A large language model specialized in Chinese legal domain, based on Baichuan-13B, is capable of retrieving and reasoning on legal knowledge. +1. **[Sunsimiao](https://github.com/X-D-Lab/Sunsimiao)**: A large language model specialized in Chinese medical domain, based on Baichuan-7B and ChatGLM-6B. +1. **[CareGPT](https://github.com/WangRongsheng/CareGPT)**: A series of large language models for Chinese medical domain, based on LLaMA2-7B and Baichuan-13B. +1. 
**[MachineMindset](https://github.com/PKU-YuanGroup/Machine-Mindset/)**: A series of MBTI Personality large language models, capable of giving any LLM 16 different personality types based on different datasets and training methods. +1. **[Luminia-13B-v3](https://huggingface.co/Nekochu/Luminia-13B-v3)**: A large language model specialized in generating metadata for Stable Diffusion. [[demo]](https://huggingface.co/spaces/Nekochu/Luminia-13B_SD_Prompt) +1. **[Chinese-LLaVA-Med](https://github.com/BUAADreamer/Chinese-LLaVA-Med)**: A multimodal large language model specialized in Chinese medical domain, based on LLaVA-1.5-7B. +1. **[AutoRE](https://github.com/THUDM/AutoRE)**: A document-level relation extraction system based on large language models. +1. **[NVIDIA RTX AI Toolkit](https://github.com/NVIDIA/RTX-AI-Toolkit)**: SDKs for fine-tuning LLMs on Windows PC for NVIDIA RTX. +1. **[LazyLLM](https://github.com/LazyAGI/LazyLLM)**: An easy and lazy way for building multi-agent LLM applications and supports model fine-tuning via LLaMA Factory. +1. **[RAG-Retrieval](https://github.com/NLPJCL/RAG-Retrieval)**: A full pipeline for RAG retrieval model fine-tuning, inference, and distillation. [[blog]](https://zhuanlan.zhihu.com/p/987727357) +1. **[360-LLaMA-Factory](https://github.com/Qihoo360/360-LLaMA-Factory)**: A modified library that supports long sequence SFT & DPO using ring attention. +1. **[Sky-T1](https://novasky-ai.github.io/posts/sky-t1/)**: An o1-like model fine-tuned by NovaSky AI with very small cost. +1. **[WeClone](https://github.com/xming521/WeClone)**: One-stop solution for creating your digital avatar from chat logs. +1. **[EmoLLM](https://github.com/SmartFlowAI/EmoLLM)**: A project about large language models (LLMs) and mental health. +
+ +## License + +This repository is licensed under the [Apache-2.0 License](LICENSE). + +Please follow the model licenses to use the corresponding model weights: [Baichuan 2](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf) / [BLOOM](https://huggingface.co/spaces/bigscience/license) / [ChatGLM3](https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE) / [Command R](https://cohere.com/c4ai-cc-by-nc-license) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [GLM-4](https://huggingface.co/THUDM/glm-4-9b/blob/main/LICENSE) / [GPT-2](https://github.com/openai/gpt-2/blob/master/LICENSE) / [Granite](LICENSE) / [Index](https://huggingface.co/IndexTeam/Index-1.9B/blob/main/LICENSE) / [InternLM](https://github.com/InternLM/InternLM#license) / [Llama](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [Llama 2](https://ai.meta.com/llama/license/) / [Llama 3](https://llama.meta.com/llama3/license/) / [Llama 4](https://github.com/meta-llama/llama-models/blob/main/models/llama4/LICENSE) / [MiniCPM](https://github.com/OpenBMB/MiniCPM/blob/main/MiniCPM%20Model%20License.md) / [Mistral/Mixtral/Pixtral](LICENSE) / [OLMo](LICENSE) / [Phi-1.5/Phi-2](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx) / [Phi-3/Phi-4](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/LICENSE) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [Skywork](https://huggingface.co/Skywork/Skywork-13B-base/blob/main/Skywork%20Community%20License.pdf) / [StarCoder 2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [TeleChat2](https://huggingface.co/Tele-AI/telechat-7B/blob/main/TeleChat%E6%A8%A1%E5%9E%8B%E7%A4%BE%E5%8C%BA%E8%AE%B8%E5%8F%AF%E5%8D%8F%E8%AE%AE.pdf) / 
[XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf) / [Yi](https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE) / [Yi-1.5](LICENSE) / [Yuan 2](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan) + +## Citation + +If this work is helpful, please kindly cite as: + +```bibtex +@inproceedings{zheng2024llamafactory, + title={LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models}, + author={Yaowei Zheng and Richong Zhang and Junhao Zhang and Yanhan Ye and Zheyan Luo and Zhangchi Feng and Yongqiang Ma}, + booktitle={Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)}, + address={Bangkok, Thailand}, + publisher={Association for Computational Linguistics}, + year={2024}, + url={http://arxiv.org/abs/2403.13372} +} +``` + +## Acknowledgement + +This repo benefits from [PEFT](https://github.com/huggingface/peft), [TRL](https://github.com/huggingface/trl), [QLoRA](https://github.com/artidoro/qlora) and [FastChat](https://github.com/lm-sys/FastChat). Thanks for their wonderful works. 
+ +## Star History + +![Star History Chart](https://api.star-history.com/svg?repos=hiyouga/LLaMA-Factory&type=Date) diff --git a/Model/Model/LLaMA-Factory/README_zh.md b/Model/Model/LLaMA-Factory/README_zh.md new file mode 100644 index 0000000000000000000000000000000000000000..216d9b6ce0ffbcff8aaaf8dd34e9f5b3e1301abe --- /dev/null +++ b/Model/Model/LLaMA-Factory/README_zh.md @@ -0,0 +1,943 @@ +![# LLaMA Factory](assets/logo.png) + +[![GitHub Repo stars](https://img.shields.io/github/stars/hiyouga/LLaMA-Factory?style=social)](https://github.com/hiyouga/LLaMA-Factory/stargazers) +[![GitHub last commit](https://img.shields.io/github/last-commit/hiyouga/LLaMA-Factory)](https://github.com/hiyouga/LLaMA-Factory/commits/main) +[![GitHub contributors](https://img.shields.io/github/contributors/hiyouga/LLaMA-Factory?color=orange)](https://github.com/hiyouga/LLaMA-Factory/graphs/contributors) +[![GitHub workflow](https://github.com/hiyouga/LLaMA-Factory/actions/workflows/tests.yml/badge.svg)](https://github.com/hiyouga/LLaMA-Factory/actions/workflows/tests.yml) +[![PyPI](https://img.shields.io/pypi/v/llamafactory)](https://pypi.org/project/llamafactory/) +[![Citation](https://img.shields.io/badge/citation-561-green)](https://scholar.google.com/scholar?cites=12620864006390196564) +[![Docker Pulls](https://img.shields.io/docker/pulls/hiyouga/llamafactory)](https://hub.docker.com/r/hiyouga/llamafactory/tags) + +[![Twitter](https://img.shields.io/twitter/follow/llamafactory_ai)](https://twitter.com/llamafactory_ai) +[![Discord](https://dcbadge.vercel.app/api/server/rKfvV9r9FK?compact=true&style=flat)](https://discord.gg/rKfvV9r9FK) +[![GitCode](https://gitcode.com/zhengyaowei/LLaMA-Factory/star/badge.svg)](https://gitcode.com/zhengyaowei/LLaMA-Factory) + +[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1d5KQtbemerlSDSxZIfAaWXhKr30QypiK?usp=sharing) +[![Open in 
DSW](https://gallery.pai-ml.com/assets/open-in-dsw.svg)](https://gallery.pai-ml.com/#/preview/deepLearning/nlp/llama_factory) +[![Open in Alaya](assets/alaya_new.svg)](https://docs.alayanew.com/docs/documents/newActivities/llamafactory/?utm_source=LLaMA-Factory) +[![Open in Spaces](https://img.shields.io/badge/🤗-Open%20in%20Spaces-blue)](https://huggingface.co/spaces/hiyouga/LLaMA-Board) +[![Open in Studios](https://img.shields.io/badge/ModelScope-Open%20in%20Studios-blue)](https://modelscope.cn/studios/hiyouga/LLaMA-Board) +[![Open in Novita](https://img.shields.io/badge/Novita-Deploy%20Template-blue)](https://novita.ai/templates-library/105981?sharer=88115474-394e-4bda-968e-b88e123d0c47) + +### 获得[亚马逊](https://aws.amazon.com/cn/blogs/china/a-one-stop-code-free-model-fine-tuning-deployment-platform-based-on-sagemaker-and-llama-factory/)、[英伟达](https://developer.nvidia.cn/rtx/ai-toolkit)、[阿里云](https://help.aliyun.com/zh/pai/use-cases/fine-tune-a-llama-3-model-with-llama-factory)等的应用。 + +
+ +### 赞助商 ❤️ + + + Warp sponsorship + + +#### [Warp,面向开发者的智能终端](https://warp.dev/llama-factory) + +[适用于 MacOS、Linux 和 Windows](https://warp.dev/llama-factory) + +---- + +### 使用零代码[命令行](#快速开始)与 [Web UI](#llama-board-可视化微调由-gradio-驱动) 轻松微调百余种大模型 + +![GitHub Trend](https://trendshift.io/api/badge/repositories/4535) + +
+ +👋 加入我们的[微信群](assets/wechat.jpg)、[NPU 用户群](assets/wechat_npu.jpg)或 [Alaya NeW 算力优惠群](assets/wechat_alaya.png)。 + +\[ [English](README.md) | 中文 \] + +**微调大模型可以像这样轻松…** + +https://github.com/user-attachments/assets/43b700c6-a178-41db-b1f8-8190a5d3fcfc + +选择你的打开方式: + +- **入门教程**:https://zhuanlan.zhihu.com/p/695287607 +- **框架文档**:https://llamafactory.readthedocs.io/zh-cn/latest/ +- **框架文档(昇腾 NPU)**:https://ascend.github.io/docs/sources/llamafactory/ +- **Colab(免费)**:https://colab.research.google.com/drive/1d5KQtbemerlSDSxZIfAaWXhKr30QypiK?usp=sharing +- **本地机器**:请见[如何使用](#如何使用) +- **PAI-DSW(免费试用)**:https://gallery.pai-ml.com/#/preview/deepLearning/nlp/llama_factory +- **Alaya NeW(算力优惠活动)**:https://docs.alayanew.com/docs/documents/newActivities/llamafactory/?utm_source=LLaMA-Factory + +> [!NOTE] +> 除上述链接以外的其他网站均为未经许可的第三方网站,请小心甄别。 + +## 目录 + +- [项目特色](#项目特色) +- [官方博客](#官方博客) +- [更新日志](#更新日志) +- [模型](#模型) +- [训练方法](#训练方法) +- [数据集](#数据集) +- [软硬件依赖](#软硬件依赖) +- [如何使用](#如何使用) + - [安装 LLaMA Factory](#安装-llama-factory) + - [数据准备](#数据准备) + - [快速开始](#快速开始) + - [LLaMA Board 可视化微调](#llama-board-可视化微调由-gradio-驱动) + - [构建 Docker](#构建-docker) + - [利用 vLLM 部署 OpenAI API](#利用-vllm-部署-openai-api) + - [从魔搭社区下载](#从魔搭社区下载) + - [从魔乐社区下载](#从魔乐社区下载) + - [使用 W&B 面板](#使用-wb-面板) + - [使用 SwanLab 面板](#使用-swanlab-面板) +- [使用了 LLaMA Factory 的项目](#使用了-llama-factory-的项目) +- [协议](#协议) +- [引用](#引用) +- [致谢](#致谢) + +## 项目特色 + +- **多种模型**:LLaMA、LLaVA、Mistral、Mixtral-MoE、Qwen、Qwen2-VL、DeepSeek、Yi、Gemma、ChatGLM、Phi 等等。 +- **集成方法**:(增量)预训练、(多模态)指令监督微调、奖励模型训练、PPO 训练、DPO 训练、KTO 训练、ORPO 训练等等。 +- **多种精度**:16 比特全参数微调、冻结微调、LoRA 微调和基于 AQLM/AWQ/GPTQ/LLM.int8/HQQ/EETQ 的 2/3/4/5/6/8 比特 QLoRA 微调。 +- **先进算法**:[GaLore](https://github.com/jiaweizzhao/GaLore)、[BAdam](https://github.com/Ledzy/BAdam)、[APOLLO](https://github.com/zhuhanqing/APOLLO)、[Adam-mini](https://github.com/zyushun/Adam-mini)、[Muon](https://github.com/KellerJordan/Muon)、DoRA、LongLoRA、LLaMA Pro、Mixture-of-Depths、LoRA+、LoftQ 和 PiSSA。 +- 
**实用技巧**:[FlashAttention-2](https://github.com/Dao-AILab/flash-attention)、[Unsloth](https://github.com/unslothai/unsloth)、[Liger Kernel](https://github.com/linkedin/Liger-Kernel)、RoPE scaling、NEFTune 和 rsLoRA。 +- **广泛任务**:多轮对话、工具调用、图像理解、视觉定位、视频识别和语音理解等等。 +- **实验监控**:LlamaBoard、TensorBoard、Wandb、MLflow、[SwanLab](https://github.com/SwanHubX/SwanLab) 等等。 +- **极速推理**:基于 [vLLM](https://github.com/vllm-project/vllm) 或 [SGLang](https://github.com/sgl-project/sglang) 的 OpenAI 风格 API、浏览器界面和命令行接口。 + +### 最新模型的 Day-N 微调适配 + +| 适配时间 | 模型名称 | +| ------------ | ------------------------------------------------------------ | +| Day 0 | Qwen3 / Qwen2.5-VL / Gemma 3 / InternLM 3 / MiniCPM-o-2.6 | +| Day 1 | Llama 3 / GLM-4 / Mistral Small / PaliGemma2 / Llama 4 | + +## 官方博客 + +- [使用 LLaMA-Factory 微调 Qwen2.5-VL 实现自动驾驶场景微调](https://docs.alayanew.com/docs/documents/useGuide/LLaMAFactory/mutiple/?utm_source=LLaMA-Factory)(中文) +- [通过亚马逊 SageMaker HyperPod 上的 LLaMA-Factory 增强多模态模型银行文档的视觉信息提取](https://aws.amazon.com/cn/blogs/machine-learning/how-apoidea-group-enhances-visual-information-extraction-from-banking-documents-with-multimodal-models-using-llama-factory-on-amazon-sagemaker-hyperpod/)(英文) +- [Easy Dataset × LLaMA Factory: 让大模型高效学习领域知识](https://buaa-act.feishu.cn/wiki/KY9xwTGs1iqHrRkjXBwcZP9WnL9)(中文) + +
全部博客 + +- [LLaMA Factory:微调 DeepSeek-R1-Distill-Qwen-7B 模型实现新闻标题分类器](https://gallery.pai-ml.com/#/preview/deepLearning/nlp/llama_factory_deepseek_r1_distill_7b)(中文) +- [基于 Amazon SageMaker 和 LLaMA-Factory 打造一站式无代码模型微调部署平台 Model Hub](https://aws.amazon.com/cn/blogs/china/a-one-stop-code-free-model-fine-tuning-deployment-platform-based-on-sagemaker-and-llama-factory/)(中文) +- [LLaMA Factory 多模态微调实践:微调 Qwen2-VL 构建文旅大模型](https://gallery.pai-ml.com/#/preview/deepLearning/nlp/llama_factory_qwen2vl)(中文) +- [LLaMA Factory:微调LLaMA3模型实现角色扮演](https://gallery.pai-ml.com/#/preview/deepLearning/nlp/llama_factory)(中文) + +
+ +## 更新日志 + +[25/04/28] 我们支持了 **[Qwen3](https://qwenlm.github.io/blog/qwen3/)** 系列模型的微调。 + +[25/04/21] 我们支持了 **[Muon](https://github.com/KellerJordan/Muon)** 优化器。详细用法请参照 [examples](examples/README_zh.md)。感谢 [@tianshijing](https://github.com/tianshijing) 的 PR。 + +[25/04/16] 我们支持了 **[InternVL3](https://huggingface.co/OpenGVLab/InternVL3-8B)** 模型的微调。查看 [PR #7258](https://github.com/hiyouga/LLaMA-Factory/pull/7258) 以使用。 + +[25/04/14] 我们支持了 **[GLM-Z1](https://huggingface.co/THUDM/GLM-Z1-9B-0414)** 和 **[Kimi-VL](https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct)** 模型的微调。 + +[25/04/06] 我们支持了 **[Llama 4](https://ai.meta.com/blog/llama-4-multimodal-intelligence/)** 模型的微调。查看 [PR #7611](https://github.com/hiyouga/LLaMA-Factory/pull/7611) 以使用。 + +
展开日志 + +[25/03/31] 我们支持了 **[Qwen2.5 Omni](https://qwenlm.github.io/blog/qwen2.5-omni/)** 模型的微调。查看 [PR #7537](https://github.com/hiyouga/LLaMA-Factory/pull/7537) 以使用。 + +[25/03/15] 我们支持了 **[SGLang](https://github.com/sgl-project/sglang)** 推理后端,请使用 `infer_backend: sglang` 启用。 + +[25/03/12] 我们支持了 **[Gemma 3](https://huggingface.co/blog/gemma3)** 模型的微调。 + +[25/02/24] 我们宣布开源 **[EasyR1](https://github.com/hiyouga/EasyR1)**,一个高效可扩展的多模态强化学习框架,支持高效的 GRPO 训练。 + +[25/02/11] 我们支持了在导出模型时保存 **[Ollama](https://github.com/ollama/ollama)** 配置文件。详细用法请参照 [examples](examples/README_zh.md)。 + +[25/02/05] 我们支持了在语音理解任务上微调 **[Qwen2-Audio](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct)** 和 **[MiniCPM-o-2.6](https://huggingface.co/openbmb/MiniCPM-o-2_6)** 模型。 + +[25/01/31] 我们支持了 **[DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1)** 和 **[Qwen2.5-VL](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct)** 模型的微调。 + +[25/01/15] 我们支持了 **[APOLLO](https://arxiv.org/abs/2412.05270)** 优化器。详细用法请参照 [examples](examples/README_zh.md)。 + +[25/01/14] 我们支持了 **[MiniCPM-o-2.6](https://huggingface.co/openbmb/MiniCPM-o-2_6)** 和 **[MiniCPM-V-2.6](https://huggingface.co/openbmb/MiniCPM-V-2_6)** 模型的微调。 感谢 [@BUAADreamer](https://github.com/BUAADreamer) 的 PR。
+ +[25/01/14] 我们支持了 **[InternLM 3](https://huggingface.co/collections/internlm/)** 模型的微调。感谢 [@hhaAndroid](https://github.com/hhaAndroid) 的 PR。 + +[25/01/10] 我们支持了 **[Phi-4](https://huggingface.co/microsoft/phi-4)** 模型的微调。 + +[24/12/21] 我们支持了使用 **[SwanLab](https://github.com/SwanHubX/SwanLab)** 跟踪与可视化实验。详细用法请参考 [此部分](#使用-swanlab-面板)。 + +[24/11/27] 我们支持了 **[Skywork-o1](https://huggingface.co/Skywork/Skywork-o1-Open-Llama-3.1-8B)** 模型的微调和 **[OpenO1](https://huggingface.co/datasets/O1-OPEN/OpenO1-SFT)** 数据集。 + +[24/10/09] 我们支持了从 **[魔乐社区](https://modelers.cn/models)** 下载预训练模型和数据集。详细用法请参照 [此教程](#从魔乐社区下载)。 + +[24/09/19] 我们支持了 **[Qwen2.5](https://qwenlm.github.io/blog/qwen2.5/)** 模型的微调。 + +[24/08/30] 我们支持了 **[Qwen2-VL](https://qwenlm.github.io/blog/qwen2-vl/)** 模型的微调。感谢 [@simonJJJ](https://github.com/simonJJJ) 的 PR。 + +[24/08/27] 我们支持了 **[Liger Kernel](https://github.com/linkedin/Liger-Kernel)**。请使用 `enable_liger_kernel: true` 来加速训练。 + +[24/08/09] 我们支持了 **[Adam-mini](https://github.com/zyushun/Adam-mini)** 优化器。详细用法请参照 [examples](examples/README_zh.md)。感谢 [@relic-yuexi](https://github.com/relic-yuexi) 的 PR。 + +[24/07/04] 我们支持了[无污染打包训练](https://github.com/MeetKai/functionary/tree/main/functionary/train/packing)。请使用 `neat_packing: true` 参数。感谢 [@chuan298](https://github.com/chuan298) 的 PR。 + +[24/06/16] 我们支持了 **[PiSSA](https://arxiv.org/abs/2404.02948)** 算法。详细用法请参照 [examples](examples/README_zh.md)。 + +[24/06/07] 我们支持了 **[Qwen2](https://qwenlm.github.io/blog/qwen2/)** 和 **[GLM-4](https://github.com/THUDM/GLM-4)** 模型的微调。 + +[24/05/26] 我们支持了 **[SimPO](https://arxiv.org/abs/2405.14734)** 偏好对齐算法。详细用法请参照 [examples](examples/README_zh.md)。 + +[24/05/20] 我们支持了 **PaliGemma** 系列模型的微调。注意 PaliGemma 是预训练模型,你需要使用 `paligemma` 模板进行微调使其获得对话能力。 + +[24/05/18] 我们支持了 **[KTO](https://arxiv.org/abs/2402.01306)** 偏好对齐算法。详细用法请参照 [examples](examples/README_zh.md)。 + +[24/05/14] 我们支持了昇腾 NPU 设备的训练和推理。详情请查阅[安装](#安装-llama-factory)部分。 + +[24/04/26] 我们支持了多模态模型 **LLaVA-1.5** 的微调。详细用法请参照 
[examples](examples/README_zh.md)。 + +[24/04/22] 我们提供了在免费 T4 GPU 上微调 Llama-3 模型的 **[Colab 笔记本](https://colab.research.google.com/drive/1d5KQtbemerlSDSxZIfAaWXhKr30QypiK?usp=sharing)**。Hugging Face 社区公开了两个利用 LLaMA Factory 微调的 Llama-3 模型,详情请见 [Llama3-8B-Chinese-Chat](https://huggingface.co/shenzhi-wang/Llama3-8B-Chinese-Chat) 和 [Llama3-Chinese](https://huggingface.co/zhichen/Llama3-Chinese)。 + +[24/04/21] 我们基于 [AstraMindAI 的仓库](https://github.com/astramind-ai/Mixture-of-depths)支持了 **[混合深度训练](https://arxiv.org/abs/2404.02258)**。详细用法请参照 [examples](examples/README_zh.md)。 + +[24/04/16] 我们支持了 **[BAdam](https://arxiv.org/abs/2404.02827)** 优化器。详细用法请参照 [examples](examples/README_zh.md)。 + +[24/04/16] 我们支持了 **[unsloth](https://github.com/unslothai/unsloth)** 的长序列训练(24GB 可训练 Llama-2-7B-56k)。该方法相比 FlashAttention-2 提供了 **117%** 的训练速度和 **50%** 的显存节约。更多数据请见[此页面](https://github.com/hiyouga/LLaMA-Factory/wiki/Performance-comparison)。 + +[24/03/31] 我们支持了 **[ORPO](https://arxiv.org/abs/2403.07691)**。详细用法请参照 [examples](examples/README_zh.md)。 + +[24/03/21] 我们的论文 "[LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models](https://arxiv.org/abs/2403.13372)" 可在 arXiv 上查看! 
+ +[24/03/20] 我们支持了能在 2x24GB GPU 上微调 70B 模型的 **FSDP+QLoRA**。详细用法请参照 [examples](examples/README_zh.md)。 + +[24/03/13] 我们支持了 **[LoRA+](https://arxiv.org/abs/2402.12354)**。详细用法请参照 [examples](examples/README_zh.md)。 + +[24/03/07] 我们支持了 **[GaLore](https://arxiv.org/abs/2403.03507)** 优化器。详细用法请参照 [examples](examples/README_zh.md)。 + +[24/03/07] 我们集成了 **[vLLM](https://github.com/vllm-project/vllm)** 以实现极速并发推理。请使用 `infer_backend: vllm` 来获得 **270%** 的推理速度。 + +[24/02/28] 我们支持了 **[DoRA](https://arxiv.org/abs/2402.09353)** 微调。请使用 `use_dora: true` 参数进行 DoRA 微调。 + +[24/02/15] 我们支持了 [LLaMA Pro](https://github.com/TencentARC/LLaMA-Pro) 提出的**块扩展**方法。详细用法请参照 [examples](examples/README_zh.md)。 + +[24/02/05] Qwen1.5(Qwen2 测试版)系列模型已在 LLaMA-Factory 中实现微调支持。详情请查阅该[博客页面](https://qwenlm.github.io/zh/blog/qwen1.5/)。 + +[24/01/18] 我们针对绝大多数模型实现了 **Agent 微调**,微调时指定 `dataset: glaive_toolcall_zh` 即可使模型获得工具调用能力。 + +[23/12/23] 我们针对 LLaMA, Mistral 和 Yi 模型支持了 **[unsloth](https://github.com/unslothai/unsloth)** 的 LoRA 训练加速。请使用 `use_unsloth: true` 参数启用 unsloth 优化。该方法可提供 **170%** 的训练速度,详情请查阅[此页面](https://github.com/hiyouga/LLaMA-Factory/wiki/Performance-comparison)。 + +[23/12/12] 我们支持了微调最新的混合专家模型 **[Mixtral 8x7B](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1)**。硬件需求请查阅[此处](#硬件依赖)。 + +[23/12/01] 我们支持了从 **[魔搭社区](https://modelscope.cn/models)** 下载预训练模型和数据集。详细用法请参照 [此教程](#从魔搭社区下载)。 + +[23/10/21] 我们支持了 **[NEFTune](https://arxiv.org/abs/2310.05914)** 训练技巧。请使用 `neftune_noise_alpha: 5` 参数启用 NEFTune。 + +[23/09/27] 我们针对 LLaMA 模型支持了 [LongLoRA](https://github.com/dvlab-research/LongLoRA) 提出的 **$S^2$-Attn**。请使用 `shift_attn: true` 参数以启用该功能。 + +[23/09/23] 我们在项目中集成了 MMLU、C-Eval 和 CMMLU 评估集。详细用法请参照 [examples](examples/README_zh.md)。 + +[23/09/10] 我们支持了 **[FlashAttention-2](https://github.com/Dao-AILab/flash-attention)**。如果您使用的是 RTX4090、A100 或 H100 GPU,请使用 `flash_attn: fa2` 参数以启用 FlashAttention-2。 + +[23/08/12] 我们支持了 **RoPE 插值**来扩展 LLaMA 模型的上下文长度。请使用 `rope_scaling: linear` 参数训练模型或使用 `rope_scaling: dynamic` 参数评估模型。 + 
+[23/08/11] 我们支持了指令模型的 **[DPO 训练](https://arxiv.org/abs/2305.18290)**。详细用法请参照 [examples](examples/README_zh.md)。 + +[23/07/31] 我们支持了**数据流式加载**。请使用 `streaming: true` 和 `max_steps: 10000` 参数来流式加载数据集。 + +[23/07/29] 我们在 Hugging Face 发布了两个 13B 指令微调模型。详细内容请查阅我们的 Hugging Face 项目([LLaMA-2](https://huggingface.co/hiyouga/Llama-2-Chinese-13b-chat) / [Baichuan](https://huggingface.co/hiyouga/Baichuan-13B-sft))。 + +[23/07/18] 我们开发了支持训练和测试的**浏览器一体化界面**。请使用 `train_web.py` 在您的浏览器中微调模型。感谢 [@KanadeSiina](https://github.com/KanadeSiina) 和 [@codemayq](https://github.com/codemayq) 在该功能开发中付出的努力。 + +[23/07/09] 我们开源了 **[FastEdit](https://github.com/hiyouga/FastEdit)** ⚡🩹,一个简单易用的、能迅速编辑大模型事实记忆的工具包。如果您感兴趣请关注我们的 [FastEdit](https://github.com/hiyouga/FastEdit) 项目。 + +[23/06/29] 我们提供了一个**可复现的**指令模型微调示例,详细内容请查阅 [Baichuan-7B-sft](https://huggingface.co/hiyouga/Baichuan-7B-sft)。 + +[23/06/22] 我们对齐了[示例 API](src/api_demo.py) 与 [OpenAI API](https://platform.openai.com/docs/api-reference/chat) 的格式,您可以将微调模型接入**任意基于 ChatGPT 的应用**中。 + +[23/06/03] 我们实现了 4 比特的 LoRA 训练(也称 **[QLoRA](https://github.com/artidoro/qlora)**)。详细用法请参照 [examples](examples/README_zh.md)。 + +
+ +> [!TIP] +> 如果您无法使用最新的功能,请尝试重新拉取代码并再次安装 LLaMA-Factory。 + +## 模型 + +| 模型名 | 参数量 | Template | +| ----------------------------------------------------------------- | -------------------------------- | ------------------- | +| [Baichuan 2](https://huggingface.co/baichuan-inc) | 7B/13B | baichuan2 | +| [BLOOM/BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | - | +| [ChatGLM3](https://huggingface.co/THUDM) | 6B | chatglm3 | +| [Command R](https://huggingface.co/CohereForAI) | 35B/104B | cohere | +| [DeepSeek (Code/MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | deepseek | +| [DeepSeek 2.5/3](https://huggingface.co/deepseek-ai) | 236B/671B | deepseek3 | +| [DeepSeek R1 (Distill)](https://huggingface.co/deepseek-ai) | 1.5B/7B/8B/14B/32B/70B/671B | deepseekr1 | +| [Falcon](https://huggingface.co/tiiuae) | 7B/11B/40B/180B | falcon | +| [Gemma/Gemma 2/CodeGemma](https://huggingface.co/google) | 2B/7B/9B/27B | gemma | +| [Gemma 3](https://huggingface.co/google) | 1B/4B/12B/27B | gemma3/gemma (1B) | +| [GLM-4/GLM-4-0414/GLM-Z1](https://huggingface.co/THUDM) | 9B/32B | glm4/glmz1 | +| [GPT-2](https://huggingface.co/openai-community) | 0.1B/0.4B/0.8B/1.5B | - | +| [Granite 3.0-3.3](https://huggingface.co/ibm-granite) | 1B/2B/3B/8B | granite3 | +| [Hunyuan](https://huggingface.co/tencent/) | 7B | hunyuan | +| [Index](https://huggingface.co/IndexTeam) | 1.9B | index | +| [InternLM 2-3](https://huggingface.co/internlm) | 7B/8B/20B | intern2 | +| [InternVL 2.5-3](https://huggingface.co/OpenGVLab) | 1B/2B/8B/14B/38B/78B | intern_vl | +| [Kimi-VL](https://huggingface.co/moonshotai) | 16B | kimi_vl | +| [Llama](https://github.com/facebookresearch/llama) | 7B/13B/33B/65B | - | +| [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 | +| [Llama 3-3.3](https://huggingface.co/meta-llama) | 1B/3B/8B/70B | llama3 | +| [Llama 4](https://huggingface.co/meta-llama) | 109B/402B | llama4 | +| [Llama 3.2 
Vision](https://huggingface.co/meta-llama) | 11B/90B | mllama | +| [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | llava | +| [LLaVA-NeXT](https://huggingface.co/llava-hf) | 7B/8B/13B/34B/72B/110B | llava_next | +| [LLaVA-NeXT-Video](https://huggingface.co/llava-hf) | 7B/34B | llava_next_video | +| [MiMo](https://huggingface.co/XiaomiMiMo) | 7B | mimo | +| [MiniCPM](https://huggingface.co/openbmb) | 0.5B/1B/2B/4B/8B | cpm/cpm3/cpm4 | +| [MiniCPM-o-2.6/MiniCPM-V-2.6](https://huggingface.co/openbmb) | 8B | minicpm_o/minicpm_v | +| [Ministral/Mistral-Nemo](https://huggingface.co/mistralai) | 8B/12B | ministral | +| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral | +| [Mistral Small](https://huggingface.co/mistralai) | 24B | mistral_small | +| [OLMo](https://huggingface.co/allenai) | 1B/7B | - | +| [PaliGemma/PaliGemma2](https://huggingface.co/google) | 3B/10B/28B | paligemma | +| [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | +| [Phi-3/Phi-3.5](https://huggingface.co/microsoft) | 4B/14B | phi | +| [Phi-3-small](https://huggingface.co/microsoft) | 7B | phi_small | +| [Phi-4](https://huggingface.co/microsoft) | 14B | phi4 | +| [Pixtral](https://huggingface.co/mistralai) | 12B | pixtral | +| [Qwen (1-2.5) (Code/Math/MoE/QwQ)](https://huggingface.co/Qwen) | 0.5B/1.5B/3B/7B/14B/32B/72B/110B | qwen | +| [Qwen3 (MoE)](https://huggingface.co/Qwen) | 0.6B/1.7B/4B/8B/14B/32B/235B | qwen3 | +| [Qwen2-Audio](https://huggingface.co/Qwen) | 7B | qwen2_audio | +| [Qwen2.5-Omni](https://huggingface.co/Qwen) | 3B/7B | qwen2_omni | +| [Qwen2-VL/Qwen2.5-VL/QVQ](https://huggingface.co/Qwen) | 2B/3B/7B/32B/72B | qwen2_vl | +| [Seed Coder](https://huggingface.co/ByteDance-Seed) | 8B | seed_coder | +| [Skywork o1](https://huggingface.co/Skywork) | 8B | skywork_o1 | +| [StarCoder 2](https://huggingface.co/bigcode) | 3B/7B/15B | - | +| [TeleChat2](https://huggingface.co/Tele-AI) | 3B/7B/35B/115B | telechat2 | +| 
[XVERSE](https://huggingface.co/xverse) | 7B/13B/65B | xverse | +| [Yi/Yi-1.5 (Code)](https://huggingface.co/01-ai) | 1.5B/6B/9B/34B | yi | +| [Yi-VL](https://huggingface.co/01-ai) | 6B/34B | yi_vl | +| [Yuan 2](https://huggingface.co/IEITYuan) | 2B/51B/102B | yuan | + +> [!NOTE] +> 对于所有“基座”(Base)模型,`template` 参数可以是 `default`, `alpaca`, `vicuna` 等任意值。但“对话”(Instruct/Chat)模型请务必使用**对应的模板**。 +> +> 请务必在训练和推理时采用**完全一致**的模板。 +> +> \*:您需要从 main 分支安装 `transformers` 并使用 `DISABLE_VERSION_CHECK=1` 来跳过版本检查。 +> +> \*\*:您需要安装特定版本的 `transformers` 以使用该模型。 + +项目所支持模型的完整列表请参阅 [constants.py](src/llamafactory/extras/constants.py)。 + +您也可以在 [template.py](src/llamafactory/data/template.py) 中添加自己的对话模板。 + +## 训练方法 + +| 方法 | 全参数训练 | 部分参数训练 | LoRA | QLoRA | +| --------------------- | ------------------ | ------------------ | ------------------ | ------------------ | +| 预训练 | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | +| 指令监督微调 | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | +| 奖励模型训练 | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | +| PPO 训练 | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | +| DPO 训练 | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | +| KTO 训练 | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | +| ORPO 训练 | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | +| SimPO 训练 | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | + +> [!TIP] +> 有关 PPO 的实现细节,请参考[此博客](https://newfacade.github.io/notes-on-reinforcement-learning/17-ppo-trl.html)。 + +## 数据集 + +
预训练数据集 + +- [Wiki Demo (en)](data/wiki_demo.txt) +- [RefinedWeb (en)](https://huggingface.co/datasets/tiiuae/falcon-refinedweb) +- [RedPajama V2 (en)](https://huggingface.co/datasets/togethercomputer/RedPajama-Data-V2) +- [Wikipedia (en)](https://huggingface.co/datasets/olm/olm-wikipedia-20221220) +- [Wikipedia (zh)](https://huggingface.co/datasets/pleisto/wikipedia-cn-20230720-filtered) +- [Pile (en)](https://huggingface.co/datasets/EleutherAI/pile) +- [SkyPile (zh)](https://huggingface.co/datasets/Skywork/SkyPile-150B) +- [FineWeb (en)](https://huggingface.co/datasets/HuggingFaceFW/fineweb) +- [FineWeb-Edu (en)](https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu) +- [The Stack (en)](https://huggingface.co/datasets/bigcode/the-stack) +- [StarCoder (en)](https://huggingface.co/datasets/bigcode/starcoderdata) + +
+ +
指令微调数据集 + +- [Identity (en&zh)](data/identity.json) +- [Stanford Alpaca (en)](https://github.com/tatsu-lab/stanford_alpaca) +- [Stanford Alpaca (zh)](https://github.com/ymcui/Chinese-LLaMA-Alpaca-3) +- [Alpaca GPT4 (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM) +- [Glaive Function Calling V2 (en&zh)](https://huggingface.co/datasets/glaiveai/glaive-function-calling-v2) +- [LIMA (en)](https://huggingface.co/datasets/GAIR/lima) +- [Guanaco Dataset (multilingual)](https://huggingface.co/datasets/JosephusCheung/GuanacoDataset) +- [BELLE 2M (zh)](https://huggingface.co/datasets/BelleGroup/train_2M_CN) +- [BELLE 1M (zh)](https://huggingface.co/datasets/BelleGroup/train_1M_CN) +- [BELLE 0.5M (zh)](https://huggingface.co/datasets/BelleGroup/train_0.5M_CN) +- [BELLE Dialogue 0.4M (zh)](https://huggingface.co/datasets/BelleGroup/generated_chat_0.4M) +- [BELLE School Math 0.25M (zh)](https://huggingface.co/datasets/BelleGroup/school_math_0.25M) +- [BELLE Multiturn Chat 0.8M (zh)](https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M) +- [UltraChat (en)](https://github.com/thunlp/UltraChat) +- [OpenPlatypus (en)](https://huggingface.co/datasets/garage-bAInd/Open-Platypus) +- [CodeAlpaca 20k (en)](https://huggingface.co/datasets/sahil2801/CodeAlpaca-20k) +- [Alpaca CoT (multilingual)](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT) +- [OpenOrca (en)](https://huggingface.co/datasets/Open-Orca/OpenOrca) +- [SlimOrca (en)](https://huggingface.co/datasets/Open-Orca/SlimOrca) +- [MathInstruct (en)](https://huggingface.co/datasets/TIGER-Lab/MathInstruct) +- [Firefly 1.1M (zh)](https://huggingface.co/datasets/YeungNLP/firefly-train-1.1M) +- [Wiki QA (en)](https://huggingface.co/datasets/wiki_qa) +- [Web QA (zh)](https://huggingface.co/datasets/suolyer/webqa) +- [WebNovel (zh)](https://huggingface.co/datasets/zxbsmk/webnovel_cn) +- [Nectar (en)](https://huggingface.co/datasets/berkeley-nest/Nectar) +- [deepctrl 
(en&zh)](https://www.modelscope.cn/datasets/deepctrl/deepctrl-sft-data) +- [Advertise Generating (zh)](https://huggingface.co/datasets/HasturOfficial/adgen) +- [ShareGPT Hyperfiltered (en)](https://huggingface.co/datasets/totally-not-an-llm/sharegpt-hyperfiltered-3k) +- [ShareGPT4 (en&zh)](https://huggingface.co/datasets/shibing624/sharegpt_gpt4) +- [UltraChat 200k (en)](https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k) +- [AgentInstruct (en)](https://huggingface.co/datasets/THUDM/AgentInstruct) +- [LMSYS Chat 1M (en)](https://huggingface.co/datasets/lmsys/lmsys-chat-1m) +- [Evol Instruct V2 (en)](https://huggingface.co/datasets/WizardLM/WizardLM_evol_instruct_V2_196k) +- [Cosmopedia (en)](https://huggingface.co/datasets/HuggingFaceTB/cosmopedia) +- [STEM (zh)](https://huggingface.co/datasets/hfl/stem_zh_instruction) +- [Ruozhiba (zh)](https://huggingface.co/datasets/hfl/ruozhiba_gpt4_turbo) +- [Neo-sft (zh)](https://huggingface.co/datasets/m-a-p/neo_sft_phase2) +- [Magpie-Pro-300K-Filtered (en)](https://huggingface.co/datasets/Magpie-Align/Magpie-Pro-300K-Filtered) +- [Magpie-ultra-v0.1 (en)](https://huggingface.co/datasets/argilla/magpie-ultra-v0.1) +- [WebInstructSub (en)](https://huggingface.co/datasets/TIGER-Lab/WebInstructSub) +- [OpenO1-SFT (en&zh)](https://huggingface.co/datasets/O1-OPEN/OpenO1-SFT) +- [Open-Thoughts (en)](https://huggingface.co/datasets/open-thoughts/OpenThoughts-114k) +- [Open-R1-Math (en)](https://huggingface.co/datasets/open-r1/OpenR1-Math-220k) +- [Chinese-DeepSeek-R1-Distill (zh)](https://huggingface.co/datasets/Congliu/Chinese-DeepSeek-R1-Distill-data-110k-SFT) +- [LLaVA mixed (en&zh)](https://huggingface.co/datasets/BUAADreamer/llava-en-zh-300k) +- [Pokemon-gpt4o-captions (en&zh)](https://huggingface.co/datasets/jugg1024/pokemon-gpt4o-captions) +- [Open Assistant (de)](https://huggingface.co/datasets/mayflowergmbh/oasst_de) +- [Dolly 15k (de)](https://huggingface.co/datasets/mayflowergmbh/dolly-15k_de) +- [Alpaca GPT4 
(de)](https://huggingface.co/datasets/mayflowergmbh/alpaca-gpt4_de) +- [OpenSchnabeltier (de)](https://huggingface.co/datasets/mayflowergmbh/openschnabeltier_de) +- [Evol Instruct (de)](https://huggingface.co/datasets/mayflowergmbh/evol-instruct_de) +- [Dolphin (de)](https://huggingface.co/datasets/mayflowergmbh/dolphin_de) +- [Booksum (de)](https://huggingface.co/datasets/mayflowergmbh/booksum_de) +- [Airoboros (de)](https://huggingface.co/datasets/mayflowergmbh/airoboros-3.0_de) +- [Ultrachat (de)](https://huggingface.co/datasets/mayflowergmbh/ultra-chat_de) + +
+ +
偏好数据集 + +- [DPO mixed (en&zh)](https://huggingface.co/datasets/hiyouga/DPO-En-Zh-20k) +- [UltraFeedback (en)](https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized) +- [COIG-P (zh)](https://huggingface.co/datasets/m-a-p/COIG-P) +- [RLHF-V (en)](https://huggingface.co/datasets/openbmb/RLHF-V-Dataset) +- [VLFeedback (en)](https://huggingface.co/datasets/Zhihui/VLFeedback) +- [RLAIF-V (en)](https://huggingface.co/datasets/openbmb/RLAIF-V-Dataset) +- [Orca DPO Pairs (en)](https://huggingface.co/datasets/Intel/orca_dpo_pairs) +- [HH-RLHF (en)](https://huggingface.co/datasets/Anthropic/hh-rlhf) +- [Nectar (en)](https://huggingface.co/datasets/berkeley-nest/Nectar) +- [Orca DPO (de)](https://huggingface.co/datasets/mayflowergmbh/intel_orca_dpo_pairs_de) +- [KTO mixed (en)](https://huggingface.co/datasets/argilla/kto-mix-15k) + +
+ +部分数据集的使用需要确认,我们推荐使用下述命令登录您的 Hugging Face 账户。 + +```bash +pip install --upgrade huggingface_hub +huggingface-cli login +``` + +## 软硬件依赖 + +| 必需项 | 至少 | 推荐 | +| ------------ | ------- | --------- | +| python | 3.9 | 3.10 | +| torch | 2.0.0 | 2.6.0 | +| torchvision | 0.15.0 | 0.21.0 | +| transformers | 4.45.0 | 4.50.0 | +| datasets | 2.16.0 | 3.2.0 | +| accelerate | 0.34.0 | 1.2.1 | +| peft | 0.14.0 | 0.15.1 | +| trl | 0.8.6 | 0.9.6 | + +| 可选项 | 至少 | 推荐 | +| ------------ | ------- | --------- | +| CUDA | 11.6 | 12.2 | +| deepspeed | 0.10.0 | 0.16.4 | +| bitsandbytes | 0.39.0 | 0.43.1 | +| vllm | 0.4.3 | 0.8.2 | +| flash-attn | 2.5.6 | 2.7.2 | + +### 硬件依赖 + +\* *估算值* + +| 方法 | 精度 | 7B | 14B | 30B | 70B | `x`B | +| ------------------------------- | ---- | ----- | ----- | ----- | ------ | ------- | +| Full (`bf16` or `fp16`) | 32 | 120GB | 240GB | 600GB | 1200GB | `18x`GB | +| Full (`pure_bf16`) | 16 | 60GB | 120GB | 300GB | 600GB | `8x`GB | +| Freeze/LoRA/GaLore/APOLLO/BAdam | 16 | 16GB | 32GB | 64GB | 160GB | `2x`GB | +| QLoRA | 8 | 10GB | 20GB | 40GB | 80GB | `x`GB | +| QLoRA | 4 | 6GB | 12GB | 24GB | 48GB | `x/2`GB | +| QLoRA | 2 | 4GB | 8GB | 16GB | 24GB | `x/4`GB | + +## 如何使用 + +### 安装 LLaMA Factory + +> [!IMPORTANT] +> 此步骤为必需。 + +#### 从源码安装 + +```bash +git clone --depth 1 https://github.com/hiyouga/LLaMA-Factory.git +cd LLaMA-Factory +pip install -e ".[torch,metrics]" --no-build-isolation +``` + +可选的额外依赖项:torch、torch-npu、metrics、deepspeed、liger-kernel、bitsandbytes、hqq、eetq、gptq、aqlm、vllm、sglang、galore、apollo、badam、adam-mini、qwen、minicpm_v、modelscope、openmind、swanlab、dev + +#### 从镜像安装 + +```bash +docker run -it --rm --gpus=all --ipc=host hiyouga/llamafactory:latest +``` + +该镜像基于 Ubuntu 22.04(x86\_64)、CUDA 12.4、Python 3.11、PyTorch 2.6.0 和 Flash-attn 2.7.4 构建。 + +查看全部镜像:https://hub.docker.com/r/hiyouga/llamafactory/tags + +请参阅[构建 Docker](#构建-docker) 来重新构建镜像。 + +
使用 uv 构建虚拟环境 + +使用 [uv](https://github.com/astral-sh/uv) 创建隔离的 Python 环境: + +```bash +uv sync --extra torch --extra metrics --prerelease=allow +``` + +在环境中运行 LLaMA-Factory: + +```bash +uv run --prerelease=allow llamafactory-cli train examples/train_lora/llama3_lora_pretrain.yaml +``` + +
+ +
Windows 用户指南 + +#### 安装 PyTorch + +Windows 平台需要额外手动安装 GPU 版本的 PyTorch 依赖包,您可以参考[官方网站](https://pytorch.org/get-started/locally/)和以下命令安装并测试 PyTorch 是否正确安装。 + +```bash +pip uninstall torch torchvision torchaudio +pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126 +python -c "import torch; print(torch.cuda.is_available())" +``` + +如果看到 `True` 则说明安装成功。 + +若遇到类似 `Can't pickle local object` 的报错,请设置 `dataloader_num_workers: 0`。 + +#### 安装 BitsAndBytes + +如果要在 Windows 平台上开启量化 LoRA(QLoRA),需要安装预编译的 `bitsandbytes` 库, 支持 CUDA 11.1 到 12.2, 请根据您的 CUDA 版本情况选择适合的[发布版本](https://github.com/jllllll/bitsandbytes-windows-webui/releases/tag/wheels)。 + +```bash +pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.2.post2-py3-none-win_amd64.whl +``` + +#### 安装 Flash Attention-2 + +如果要在 Windows 平台上开启 FlashAttention-2,请使用 [flash-attention-windows-wheel](https://huggingface.co/lldacing/flash-attention-windows-wheel) 中的脚本自行编译与安装。 + +
+ +
昇腾 NPU 用户指南 + +在昇腾 NPU 设备上安装 LLaMA Factory 时,请升级 Python 到 3.10 及以上,并需要指定额外依赖项,使用 `pip install -e ".[torch-npu,metrics]"` 命令安装。此外,还需要安装 **[Ascend CANN Toolkit 与 Kernels](https://www.hiascend.com/developer/download/community/result?module=cann)**,安装方法请参考[安装教程](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC2alpha002/quickstart/quickstart/quickstart_18_0004.html)或使用以下命令: + +```bash +# 请替换 URL 为 CANN 版本和设备型号对应的 URL +# 安装 CANN Toolkit +wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/Milan-ASL/Milan-ASL%20V100R001C17SPC701/Ascend-cann-toolkit_8.0.RC1.alpha001_linux-"$(uname -i)".run +bash Ascend-cann-toolkit_8.0.RC1.alpha001_linux-"$(uname -i)".run --install + +# 安装 CANN Kernels +wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/Milan-ASL/Milan-ASL%20V100R001C17SPC701/Ascend-cann-kernels-910b_8.0.RC1.alpha001_linux.run +bash Ascend-cann-kernels-910b_8.0.RC1.alpha001_linux.run --install + +# 设置环境变量 +source /usr/local/Ascend/ascend-toolkit/set_env.sh +``` + +| 依赖项 | 至少 | 推荐 | +| ------------ | ------- | -------------- | +| CANN | 8.0.RC1 | 8.0.0.alpha002 | +| torch | 2.1.0 | 2.4.0 | +| torch-npu | 2.1.0 | 2.4.0.post2 | +| deepspeed | 0.13.2 | 0.13.2 | +| vllm-ascend | - | 0.7.3 | + +请使用 `ASCEND_RT_VISIBLE_DEVICES` 而非 `CUDA_VISIBLE_DEVICES` 来指定运算设备。 + +如果遇到无法正常推理的情况,请尝试设置 `do_sample: false`。 + +下载预构建 Docker 镜像:[32GB](http://mirrors.cn-central-221.ovaijisuan.com/detail/130.html) | [64GB](http://mirrors.cn-central-221.ovaijisuan.com/detail/131.html) + +#### 安装 BitsAndBytes + +如果要在 Ascend NPU 上进行基于 bitsandbytes 的 QLoRA 量化微调,请执行如下步骤: + +1. 
手动编译 bitsandbytes:请参考[安装文档](https://huggingface.co/docs/bitsandbytes/installation?backend=Ascend+NPU&platform=Ascend+NPU)完成 NPU 版的 bitsandbytes 安装,编译要求环境 cmake 版本不低于 3.22.1,g++ 版本不低于 12.x。 + +```bash +# 从源码安装 bitsandbytes +# 克隆 bitsandbytes 仓库, Ascend NPU 目前在 multi-backend-refactor 中支持 +git clone -b multi-backend-refactor https://github.com/bitsandbytes-foundation/bitsandbytes.git +cd bitsandbytes/ + +# 安装依赖 +pip install -r requirements-dev.txt + +# 安装编译工具依赖,该步骤在不同系统上命令有所不同,供参考 +apt-get install -y build-essential cmake + +# 编译 & 安装 +cmake -DCOMPUTE_BACKEND=npu -S . +make +pip install . +``` + +2. 安装 transformers 的 main 分支版本。 + +```bash +git clone -b main https://github.com/huggingface/transformers.git +cd transformers +pip install . +``` + +3. 在训练参数中设置 `double_quantization: false`,可参考[示例](examples/train_qlora/llama3_lora_sft_bnb_npu.yaml)。 + +
+ +### 数据准备 + +关于数据集文件的格式,请参考 [data/README_zh.md](data/README_zh.md) 的内容。你可以使用 HuggingFace / ModelScope / Modelers 上的数据集或加载本地数据集。 + +> [!NOTE] +> 使用自定义数据集时,请更新 `data/dataset_info.json` 文件。 + +您也可以使用 **[Easy Dataset](https://github.com/ConardLi/easy-dataset)** 或 **[GraphGen](https://github.com/open-sciencelab/GraphGen)** 构建用于微调的合成数据。 + +### 快速开始 + +下面三行命令分别对 Llama3-8B-Instruct 模型进行 LoRA **微调**、**推理**和**合并**。 + +```bash +llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml +llamafactory-cli chat examples/inference/llama3_lora_sft.yaml +llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml +``` + +高级用法请参考 [examples/README_zh.md](examples/README_zh.md)(包括多 GPU 微调)。 + +> [!TIP] +> 使用 `llamafactory-cli help` 显示帮助信息。 +> +> 遇到报错请先看[常见问题](https://github.com/hiyouga/LLaMA-Factory/issues/4614)。 + +### LLaMA Board 可视化微调(由 [Gradio](https://github.com/gradio-app/gradio) 驱动) + +```bash +llamafactory-cli webui +``` + +### 构建 Docker + +CUDA 用户: + +```bash +cd docker/docker-cuda/ +docker compose up -d +docker compose exec llamafactory bash +``` + +昇腾 NPU 用户: + +```bash +cd docker/docker-npu/ +docker compose up -d +docker compose exec llamafactory bash +``` + +AMD ROCm 用户: + +```bash +cd docker/docker-rocm/ +docker compose up -d +docker compose exec llamafactory bash +``` + +
不使用 Docker Compose 构建 + +CUDA 用户: + +```bash +docker build -f ./docker/docker-cuda/Dockerfile \ + --build-arg PIP_INDEX=https://pypi.org/simple \ + --build-arg EXTRAS=metrics \ + -t llamafactory:latest . + +docker run -dit --ipc=host --gpus=all \ + -p 7860:7860 \ + -p 8000:8000 \ + --name llamafactory \ + llamafactory:latest + +docker exec -it llamafactory bash +``` + +昇腾 NPU 用户: + +```bash +docker build -f ./docker/docker-npu/Dockerfile \ + --build-arg PIP_INDEX=https://pypi.org/simple \ + --build-arg EXTRAS=torch-npu,metrics \ + -t llamafactory:latest . + +docker run -dit --ipc=host \ + -v /usr/local/dcmi:/usr/local/dcmi \ + -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ + -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \ + -v /etc/ascend_install.info:/etc/ascend_install.info \ + -p 7860:7860 \ + -p 8000:8000 \ + --device /dev/davinci0 \ + --device /dev/davinci_manager \ + --device /dev/devmm_svm \ + --device /dev/hisi_hdc \ + --name llamafactory \ + llamafactory:latest + +docker exec -it llamafactory bash +``` + +AMD ROCm 用户: + +```bash +docker build -f ./docker/docker-rocm/Dockerfile \ + --build-arg PIP_INDEX=https://pypi.org/simple \ + --build-arg EXTRAS=metrics \ + -t llamafactory:latest . + +docker run -dit --ipc=host \ + -p 7860:7860 \ + -p 8000:8000 \ + --device /dev/kfd \ + --device /dev/dri \ + --name llamafactory \ + llamafactory:latest + +docker exec -it llamafactory bash +``` + +
+ +
使用数据卷 + +您可以通过移除 Dockerfile 中 `VOLUME [ "/root/.cache/huggingface", "/app/shared_data", "/app/output" ]` 的注释来使用数据卷。 + +在构建 Docker 时使用参数 `-v ./hf_cache:/root/.cache/huggingface` 来挂载数据卷。各个数据卷的含义表示如下。 + +- `hf_cache`:使用宿主机的 Hugging Face 缓存文件夹。 +- `shared_data`:宿主机中存放数据集的文件夹路径。 +- `output`:将导出目录设置为该路径后,即可在宿主机中访问导出后的模型。 + +
+ +### 利用 vLLM 部署 OpenAI API + +```bash +API_PORT=8000 llamafactory-cli api examples/inference/llama3.yaml infer_backend=vllm vllm_enforce_eager=true +``` + +> [!TIP] +> API 文档请查阅[这里](https://platform.openai.com/docs/api-reference/chat/create)。 +> +> 示例:[图像理解](scripts/api_example/test_image.py) | [工具调用](scripts/api_example/test_toolcall.py) + +### 从魔搭社区下载 + +如果您在 Hugging Face 模型和数据集的下载中遇到了问题,可以通过下述方法使用魔搭社区。 + +```bash +export USE_MODELSCOPE_HUB=1 # Windows 使用 `set USE_MODELSCOPE_HUB=1` +``` + +将 `model_name_or_path` 设置为模型 ID 来加载对应的模型。在[魔搭社区](https://modelscope.cn/models)查看所有可用的模型,例如 `LLM-Research/Meta-Llama-3-8B-Instruct`。 + +### 从魔乐社区下载 + +您也可以通过下述方法,使用魔乐社区下载数据集和模型。 + +```bash +export USE_OPENMIND_HUB=1 # Windows 使用 `set USE_OPENMIND_HUB=1` +``` + +将 `model_name_or_path` 设置为模型 ID 来加载对应的模型。在[魔乐社区](https://modelers.cn/models)查看所有可用的模型,例如 `TeleAI/TeleChat-7B-pt`。 + +### 使用 W&B 面板 + +若要使用 [Weights & Biases](https://wandb.ai) 记录实验数据,请在 yaml 文件中添加下面的参数。 + +```yaml +report_to: wandb +run_name: test_run # 可选 +``` + +在启动训练任务时,将 `WANDB_API_KEY` 设置为[密钥](https://wandb.ai/authorize)来登录 W&B 账户。 + +### 使用 SwanLab 面板 + +若要使用 [SwanLab](https://github.com/SwanHubX/SwanLab) 记录实验数据,请在 yaml 文件中添加下面的参数。 + +```yaml +use_swanlab: true +swanlab_run_name: test_run # 可选 +``` + +在启动训练任务时,登录SwanLab账户有以下三种方式: + +方式一:在 yaml 文件中添加 `swanlab_api_key=` ,并设置为你的 [API 密钥](https://swanlab.cn/settings)。 +方式二:将环境变量 `SWANLAB_API_KEY` 设置为你的 [API 密钥](https://swanlab.cn/settings)。 +方式三:启动前使用 `swanlab login` 命令完成登录。 + +## 使用了 LLaMA Factory 的项目 + +如果您有项目希望添加至下述列表,请通过邮件联系或者创建一个 PR。 + +
点击显示 + +1. Wang et al. ESRL: Efficient Sampling-based Reinforcement Learning for Sequence Generation. 2023. [[arxiv]](https://arxiv.org/abs/2308.02223) +1. Yu et al. Open, Closed, or Small Language Models for Text Classification? 2023. [[arxiv]](https://arxiv.org/abs/2308.10092) +1. Wang et al. UbiPhysio: Support Daily Functioning, Fitness, and Rehabilitation with Action Understanding and Feedback in Natural Language. 2023. [[arxiv]](https://arxiv.org/abs/2308.10526) +1. Luceri et al. Leveraging Large Language Models to Detect Influence Campaigns in Social Media. 2023. [[arxiv]](https://arxiv.org/abs/2311.07816) +1. Zhang et al. Alleviating Hallucinations of Large Language Models through Induced Hallucinations. 2023. [[arxiv]](https://arxiv.org/abs/2312.15710) +1. Wang et al. Know Your Needs Better: Towards Structured Understanding of Marketer Demands with Analogical Reasoning Augmented LLMs. KDD 2024. [[arxiv]](https://arxiv.org/abs/2401.04319) +1. Wang et al. CANDLE: Iterative Conceptualization and Instantiation Distillation from Large Language Models for Commonsense Reasoning. ACL 2024. [[arxiv]](https://arxiv.org/abs/2401.07286) +1. Choi et al. FACT-GPT: Fact-Checking Augmentation via Claim Matching with LLMs. 2024. [[arxiv]](https://arxiv.org/abs/2402.05904) +1. Zhang et al. AutoMathText: Autonomous Data Selection with Language Models for Mathematical Texts. 2024. [[arxiv]](https://arxiv.org/abs/2402.07625) +1. Lyu et al. KnowTuning: Knowledge-aware Fine-tuning for Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2402.11176) +1. Yang et al. LaCo: Large Language Model Pruning via Layer Collapse. 2024. [[arxiv]](https://arxiv.org/abs/2402.11187) +1. Bhardwaj et al. Language Models are Homer Simpson! Safety Re-Alignment of Fine-tuned Language Models through Task Arithmetic. 2024. [[arxiv]](https://arxiv.org/abs/2402.11746) +1. Yang et al. Enhancing Empathetic Response Generation by Augmenting LLMs with Small-scale Empathetic Models. 2024. 
[[arxiv]](https://arxiv.org/abs/2402.11801) +1. Yi et al. Generation Meets Verification: Accelerating Large Language Model Inference with Smart Parallel Auto-Correct Decoding. ACL 2024 Findings. [[arxiv]](https://arxiv.org/abs/2402.11809) +1. Cao et al. Head-wise Shareable Attention for Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2402.11819) +1. Zhang et al. Enhancing Multilingual Capabilities of Large Language Models through Self-Distillation from Resource-Rich Languages. 2024. [[arxiv]](https://arxiv.org/abs/2402.12204) +1. Kim et al. Efficient and Effective Vocabulary Expansion Towards Multilingual Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2402.14714) +1. Yu et al. KIEval: A Knowledge-grounded Interactive Evaluation Framework for Large Language Models. ACL 2024. [[arxiv]](https://arxiv.org/abs/2402.15043) +1. Huang et al. Key-Point-Driven Data Synthesis with its Enhancement on Mathematical Reasoning. 2024. [[arxiv]](https://arxiv.org/abs/2403.02333) +1. Duan et al. Negating Negatives: Alignment without Human Positive Samples via Distributional Dispreference Optimization. 2024. [[arxiv]](https://arxiv.org/abs/2403.03419) +1. Xie and Schwertfeger. Empowering Robotics with Large Language Models: osmAG Map Comprehension with LLMs. 2024. [[arxiv]](https://arxiv.org/abs/2403.08228) +1. Wu et al. Large Language Models are Parallel Multilingual Learners. 2024. [[arxiv]](https://arxiv.org/abs/2403.09073) +1. Zhang et al. EDT: Improving Large Language Models' Generation by Entropy-based Dynamic Temperature Sampling. 2024. [[arxiv]](https://arxiv.org/abs/2403.14541) +1. Weller et al. FollowIR: Evaluating and Teaching Information Retrieval Models to Follow Instructions. 2024. [[arxiv]](https://arxiv.org/abs/2403.15246) +1. Hongbin Na. CBT-LLM: A Chinese Large Language Model for Cognitive Behavioral Therapy-based Mental Health Question Answering. COLING 2024. [[arxiv]](https://arxiv.org/abs/2403.16008) +1. Zan et al. 
CodeS: Natural Language to Code Repository via Multi-Layer Sketch. 2024. [[arxiv]](https://arxiv.org/abs/2403.16443) +1. Liu et al. Extensive Self-Contrast Enables Feedback-Free Language Model Alignment. 2024. [[arxiv]](https://arxiv.org/abs/2404.00604) +1. Luo et al. BAdam: A Memory Efficient Full Parameter Training Method for Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2404.02827) +1. Du et al. Chinese Tiny LLM: Pretraining a Chinese-Centric Large Language Model. 2024. [[arxiv]](https://arxiv.org/abs/2404.04167) +1. Ma et al. Parameter Efficient Quasi-Orthogonal Fine-Tuning via Givens Rotation. ICML 2024. [[arxiv]](https://arxiv.org/abs/2404.04316) +1. Liu et al. Dynamic Generation of Personalities with Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2404.07084) +1. Shang et al. How Far Have We Gone in Stripped Binary Code Understanding Using Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2404.09836) +1. Huang et al. LLMTune: Accelerate Database Knob Tuning with Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2404.11581) +1. Deng et al. Text-Tuple-Table: Towards Information Integration in Text-to-Table Generation via Global Tuple Extraction. 2024. [[arxiv]](https://arxiv.org/abs/2404.14215) +1. Acikgoz et al. Hippocrates: An Open-Source Framework for Advancing Large Language Models in Healthcare. 2024. [[arxiv]](https://arxiv.org/abs/2404.16621) +1. Zhang et al. Small Language Models Need Strong Verifiers to Self-Correct Reasoning. ACL 2024 Findings. [[arxiv]](https://arxiv.org/abs/2404.17140) +1. Zhou et al. FREB-TQA: A Fine-Grained Robustness Evaluation Benchmark for Table Question Answering. NAACL 2024. [[arxiv]](https://arxiv.org/abs/2404.18585) +1. Xu et al. Large Language Models for Cyber Security: A Systematic Literature Review. 2024. [[arxiv]](https://arxiv.org/abs/2405.04760) +1. Dammu et al. "They are uncultured": Unveiling Covert Harms and Social Threats in LLM Generated Conversations. 2024. 
[[arxiv]](https://arxiv.org/abs/2405.05378) +1. Yi et al. A safety realignment framework via subspace-oriented model fusion for large language models. 2024. [[arxiv]](https://arxiv.org/abs/2405.09055) +1. Lou et al. SPO: Multi-Dimensional Preference Sequential Alignment With Implicit Reward Modeling. 2024. [[arxiv]](https://arxiv.org/abs/2405.12739) +1. Zhang et al. Getting More from Less: Large Language Models are Good Spontaneous Multilingual Learners. 2024. [[arxiv]](https://arxiv.org/abs/2405.13816) +1. Zhang et al. TS-Align: A Teacher-Student Collaborative Framework for Scalable Iterative Finetuning of Large Language Models. 2024. [[arxiv]](https://arxiv.org/abs/2405.20215) +1. Zihong Chen. Sentence Segmentation and Sentence Punctuation Based on XunziALLM. 2024. [[paper]](https://aclanthology.org/2024.lt4hala-1.30) +1. Gao et al. The Best of Both Worlds: Toward an Honest and Helpful Large Language Model. 2024. [[arxiv]](https://arxiv.org/abs/2406.00380) +1. Wang and Song. MARS: Benchmarking the Metaphysical Reasoning Abilities of Language Models with a Multi-task Evaluation Dataset. 2024. [[arxiv]](https://arxiv.org/abs/2406.02106) +1. Hu et al. Computational Limits of Low-Rank Adaptation (LoRA) for Transformer-Based Models. 2024. [[arxiv]](https://arxiv.org/abs/2406.03136) +1. Ge et al. Time Sensitive Knowledge Editing through Efficient Finetuning. ACL 2024. [[arxiv]](https://arxiv.org/abs/2406.04496) +1. Tan et al. Peer Review as A Multi-Turn and Long-Context Dialogue with Role-Based Interactions. 2024. [[arxiv]](https://arxiv.org/abs/2406.05688) +1. Song et al. Turbo Sparse: Achieving LLM SOTA Performance with Minimal Activated Parameters. 2024. [[arxiv]](https://arxiv.org/abs/2406.05955) +1. Gu et al. RWKV-CLIP: A Robust Vision-Language Representation Learner. 2024. [[arxiv]](https://arxiv.org/abs/2406.06973) +1. Chen et al. Advancing Tool-Augmented Large Language Models: Integrating Insights from Errors in Inference Trees. 2024. 
[[arxiv]](https://arxiv.org/abs/2406.07115) +1. Zhu et al. Are Large Language Models Good Statisticians?. 2024. [[arxiv]](https://arxiv.org/abs/2406.07815) +1. Li et al. Know the Unknown: An Uncertainty-Sensitive Method for LLM Instruction Tuning. 2024. [[arxiv]](https://arxiv.org/abs/2406.10099) +1. Ding et al. IntentionQA: A Benchmark for Evaluating Purchase Intention Comprehension Abilities of Language Models in E-commerce. 2024. [[arxiv]](https://arxiv.org/abs/2406.10173) +1. He et al. COMMUNITY-CROSS-INSTRUCT: Unsupervised Instruction Generation for Aligning Large Language Models to Online Communities. 2024. [[arxiv]](https://arxiv.org/abs/2406.12074) +1. Lin et al. FVEL: Interactive Formal Verification Environment with Large Language Models via Theorem Proving. 2024. [[arxiv]](https://arxiv.org/abs/2406.14408) +1. Treutlein et al. Connecting the Dots: LLMs can Infer and Verbalize Latent Structure from Disparate Training Data. 2024. [[arxiv]](https://arxiv.org/abs/2406.14546) +1. Feng et al. SS-Bench: A Benchmark for Social Story Generation and Evaluation. 2024. [[arxiv]](https://arxiv.org/abs/2406.15695) +1. Feng et al. Self-Constructed Context Decompilation with Fined-grained Alignment Enhancement. 2024. [[arxiv]](https://arxiv.org/abs/2406.17233) +1. Liu et al. Large Language Models for Cuffless Blood Pressure Measurement From Wearable Biosignals. 2024. [[arxiv]](https://arxiv.org/abs/2406.18069) +1. Iyer et al. Exploring Very Low-Resource Translation with LLMs: The University of Edinburgh's Submission to AmericasNLP 2024 Translation Task. AmericasNLP 2024. [[paper]](https://aclanthology.org/2024.americasnlp-1.25) +1. Li et al. Calibrating LLMs with Preference Optimization on Thought Trees for Generating Rationale in Science Question Scoring. 2024. [[arxiv]](https://arxiv.org/abs/2406.19949) +1. Yang et al. Financial Knowledge Large Language Model. 2024. [[arxiv]](https://arxiv.org/abs/2407.00365) +1. Lin et al. 
DogeRM: Equipping Reward Models with Domain Knowledge through Model Merging. 2024. [[arxiv]](https://arxiv.org/abs/2407.01470) +1. Bako et al. Evaluating the Semantic Profiling Abilities of LLMs for Natural Language Utterances in Data Visualization. 2024. [[arxiv]](https://arxiv.org/abs/2407.06129) +1. Huang et al. RoLoRA: Fine-tuning Rotated Outlier-free LLMs for Effective Weight-Activation Quantization. 2024. [[arxiv]](https://arxiv.org/abs/2407.08044) +1. Jiang et al. LLM-Collaboration on Automatic Science Journalism for the General Audience. 2024. [[arxiv]](https://arxiv.org/abs/2407.09756) +1. Inouye et al. Applied Auto-tuning on LoRA Hyperparameters. 2024. [[paper]](https://scholarcommons.scu.edu/cseng_senior/272/) +1. Qi et al. Research on Tibetan Tourism Viewpoints information generation system based on LLM. 2024. [[arxiv]](https://arxiv.org/abs/2407.13561) +1. Xu et al. Course-Correction: Safety Alignment Using Synthetic Preferences. 2024. [[arxiv]](https://arxiv.org/abs/2407.16637) +1. Sun et al. LAMBDA: A Large Model Based Data Agent. 2024. [[arxiv]](https://arxiv.org/abs/2407.17535) +1. Zhu et al. CollectiveSFT: Scaling Large Language Models for Chinese Medical Benchmark with Collective Instructions in Healthcare. 2024. [[arxiv]](https://arxiv.org/abs/2407.19705) +1. Yu et al. Correcting Negative Bias in Large Language Models through Negative Attention Score Alignment. 2024. [[arxiv]](https://arxiv.org/abs/2408.00137) +1. Xie et al. The Power of Personalized Datasets: Advancing Chinese Composition Writing for Elementary School through Targeted Model Fine-Tuning. IALP 2024. [[paper]](https://www.asianlp.sg/conferences/ialp2024/proceedings/papers/IALP2024_P055.pdf) +1. Liu et al. Instruct-Code-Llama: Improving Capabilities of Language Model in Competition Level Code Generation by Online Judge Feedback. ICIC 2024. [[paper]](https://link.springer.com/chapter/10.1007/978-981-97-5669-8_11) +1. Wang et al. 
Cybernetic Sentinels: Unveiling the Impact of Safety Data Selection on Model Security in Supervised Fine-Tuning. ICIC 2024. [[paper]](https://link.springer.com/chapter/10.1007/978-981-97-5669-8_23) +1. Xia et al. Understanding the Performance and Estimating the Cost of LLM Fine-Tuning. 2024. [[arxiv]](https://arxiv.org/abs/2408.04693) +1. Zeng et al. Perceive, Reflect, and Plan: Designing LLM Agent for Goal-Directed City Navigation without Instructions. 2024. [[arxiv]](https://arxiv.org/abs/2408.04168) +1. Xia et al. Using Pre-trained Language Model for Accurate ESG Prediction. FinNLP 2024. [[paper]](https://aclanthology.org/2024.finnlp-2.1/) +1. Liang et al. I-SHEEP: Self-Alignment of LLM from Scratch through an Iterative Self-Enhancement Paradigm. 2024. [[arxiv]](https://arxiv.org/abs/2408.08072) +1. Bai et al. Aligning Large Language Model with Direct Multi-Preference Optimization for Recommendation. CIKM 2024. [[paper]](https://dl.acm.org/doi/10.1145/3627673.3679611) +1. **[StarWhisper](https://github.com/Yu-Yang-Li/StarWhisper)**: 天文大模型 StarWhisper,基于 ChatGLM2-6B 和 Qwen-14B 在天文数据上微调而得。 +1. **[DISC-LawLLM](https://github.com/FudanDISC/DISC-LawLLM)**: 中文法律领域大模型 DISC-LawLLM,基于 Baichuan-13B 微调而得,具有法律推理和知识检索能力。 +1. **[Sunsimiao](https://github.com/X-D-Lab/Sunsimiao)**: 孙思邈中文医疗大模型 Sunsimiao,基于 Baichuan-7B 和 ChatGLM-6B 在中文医疗数据上微调而得。 +1. **[CareGPT](https://github.com/WangRongsheng/CareGPT)**: 医疗大模型项目 CareGPT,基于 LLaMA2-7B 和 Baichuan-13B 在中文医疗数据上微调而得。 +1. **[MachineMindset](https://github.com/PKU-YuanGroup/Machine-Mindset/)**:MBTI性格大模型项目,根据数据集与训练方式让任意 LLM 拥有 16 个不同的性格类型。 +1. **[Luminia-13B-v3](https://huggingface.co/Nekochu/Luminia-13B-v3)**:一个用于生成 Stable Diffusion 提示词的大型语言模型。[[demo]](https://huggingface.co/spaces/Nekochu/Luminia-13B_SD_Prompt) +1. **[Chinese-LLaVA-Med](https://github.com/BUAADreamer/Chinese-LLaVA-Med)**:中文多模态医学大模型,基于 LLaVA-1.5-7B 在中文多模态医疗数据上微调而得。 +1. **[AutoRE](https://github.com/THUDM/AutoRE)**:基于大语言模型的文档级关系抽取系统。 +1. 
**[NVIDIA RTX AI Toolkit](https://github.com/NVIDIA/RTX-AI-Toolkit)**:在 Windows 主机上利用英伟达 RTX 设备进行大型语言模型微调的开发包。 +1. **[LazyLLM](https://github.com/LazyAGI/LazyLLM)**:一个低代码构建多 Agent 大模型应用的开发工具,支持基于 LLaMA Factory 的模型微调. +1. **[RAG-Retrieval](https://github.com/NLPJCL/RAG-Retrieval)**:一个全链路 RAG 检索模型微调、推理和蒸馏代码库。[[blog]](https://zhuanlan.zhihu.com/p/987727357) +1. **[360-LLaMA-Factory](https://github.com/Qihoo360/360-LLaMA-Factory)**:一个魔改后的代码库,通过 Ring Attention 支持长序列的 SFT 和 DPO 训练。 +1. **[Sky-T1](https://novasky-ai.github.io/posts/sky-t1/)**:由 NovaSky AI 微调的低成本类 o1 长推理模型。 +1. **[WeClone](https://github.com/xming521/WeClone)**:从聊天记录创造数字分身的一站式解决方案。 + +
+ +## 协议 + +本仓库的代码依照 [Apache-2.0](LICENSE) 协议开源。 + +使用模型权重时,请遵循对应的模型协议:[Baichuan 2](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf) / [BLOOM](https://huggingface.co/spaces/bigscience/license) / [ChatGLM3](https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE) / [Command R](https://cohere.com/c4ai-cc-by-nc-license) / [DeepSeek](https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL) / [Falcon](https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt) / [Gemma](https://ai.google.dev/gemma/terms) / [GLM-4](https://huggingface.co/THUDM/glm-4-9b/blob/main/LICENSE) / [GPT-2](https://github.com/openai/gpt-2/blob/master/LICENSE) / [Granite](LICENSE) / [Index](https://huggingface.co/IndexTeam/Index-1.9B/blob/main/LICENSE) / [InternLM](https://github.com/InternLM/InternLM#license) / [Llama](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) / [Llama 2](https://ai.meta.com/llama/license/) / [Llama 3](https://llama.meta.com/llama3/license/) / [Llama 4](https://github.com/meta-llama/llama-models/blob/main/models/llama4/LICENSE) / [MiniCPM](https://github.com/OpenBMB/MiniCPM/blob/main/MiniCPM%20Model%20License.md) / [Mistral/Mixtral/Pixtral](LICENSE) / [OLMo](LICENSE) / [Phi-1.5/Phi-2](https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx) / [Phi-3/Phi-4](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/LICENSE) / [Qwen](https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT) / [Skywork](https://huggingface.co/Skywork/Skywork-13B-base/blob/main/Skywork%20Community%20License.pdf) / [StarCoder 2](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) / [TeleChat2](https://huggingface.co/Tele-AI/telechat-7B/blob/main/TeleChat%E6%A8%A1%E5%9E%8B%E7%A4%BE%E5%8C%BA%E8%AE%B8%E5%8F%AF%E5%8D%8F%E8%AE%AE.pdf) / [XVERSE](https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf) / 
[Yi](https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE) / [Yi-1.5](LICENSE) / [Yuan 2](https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan) + +## 引用 + +如果您觉得此项目有帮助,请考虑以下列格式引用 + +```bibtex +@inproceedings{zheng2024llamafactory, + title={LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models}, + author={Yaowei Zheng and Richong Zhang and Junhao Zhang and Yanhan Ye and Zheyan Luo and Zhangchi Feng and Yongqiang Ma}, + booktitle={Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)}, + address={Bangkok, Thailand}, + publisher={Association for Computational Linguistics}, + year={2024}, + url={http://arxiv.org/abs/2403.13372} +} +``` + +## 致谢 + +本项目受益于 [PEFT](https://github.com/huggingface/peft)、[TRL](https://github.com/huggingface/trl)、[QLoRA](https://github.com/artidoro/qlora) 和 [FastChat](https://github.com/lm-sys/FastChat),感谢以上诸位作者的付出。 + +## Star History + +![Star History Chart](https://api.star-history.com/svg?repos=hiyouga/LLaMA-Factory&type=Date) diff --git a/Model/Model/LLaMA-Factory/assets/alaya_new.svg b/Model/Model/LLaMA-Factory/assets/alaya_new.svg new file mode 100644 index 0000000000000000000000000000000000000000..3568e1511fb758b46cfc84122554ea23c8a0b1c7 --- /dev/null +++ b/Model/Model/LLaMA-Factory/assets/alaya_new.svg @@ -0,0 +1,38 @@ + + + + + background + + + + Layer 1 + + + + + + + + + Open in Alaya NeW + + + + + diff --git a/Model/Model/LLaMA-Factory/assets/logo.png b/Model/Model/LLaMA-Factory/assets/logo.png new file mode 100644 index 0000000000000000000000000000000000000000..5fb3dd569342ca3cd30a582fd664145bd88b360c Binary files /dev/null and b/Model/Model/LLaMA-Factory/assets/logo.png differ diff --git a/Model/Model/LLaMA-Factory/assets/wechat.jpg b/Model/Model/LLaMA-Factory/assets/wechat.jpg new file mode 100644 index 0000000000000000000000000000000000000000..40ec32c3a092f95a215cf61f5404ee8610884053 --- /dev/null +++ 
b/Model/Model/LLaMA-Factory/assets/wechat.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1cf89f6586b461d5fb37bd915730b03e021be8355eb0c903e7dd6c7de2de1bdb +size 172853 diff --git a/Model/Model/LLaMA-Factory/assets/wechat_alaya.png b/Model/Model/LLaMA-Factory/assets/wechat_alaya.png new file mode 100644 index 0000000000000000000000000000000000000000..890205aca845802f75d8d7fbc71a6102e4d17375 --- /dev/null +++ b/Model/Model/LLaMA-Factory/assets/wechat_alaya.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83fecf09460bb1ed5f1ff88a8aee03ab4c7093303de062ead1659e1522917592 +size 213947 diff --git a/Model/Model/LLaMA-Factory/assets/wechat_npu.jpg b/Model/Model/LLaMA-Factory/assets/wechat_npu.jpg new file mode 100644 index 0000000000000000000000000000000000000000..5c389ec141e927db74b3ed6293416167312f1233 --- /dev/null +++ b/Model/Model/LLaMA-Factory/assets/wechat_npu.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd9a9d1fee3541605ff6e493e3331be2b02d220b874e6e830256246521e94517 +size 170889 diff --git a/Model/Model/LLaMA-Factory/data/README.md b/Model/Model/LLaMA-Factory/data/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4cc92b43960dd78688010540418455448aabe63d --- /dev/null +++ b/Model/Model/LLaMA-Factory/data/README.md @@ -0,0 +1,475 @@ +The [dataset_info.json](dataset_info.json) contains all available datasets. If you are using a custom dataset, please **make sure** to add a *dataset description* in `dataset_info.json` and specify `dataset: dataset_name` before training to use it. + +The `dataset_info.json` file should be put in the `dataset_dir` directory. You can change `dataset_dir` to use another directory. The default value is `./data`. + +Currently we support datasets in **alpaca** and **sharegpt** format. Allowed file types include json, jsonl, csv, parquet, arrow. 
+ +```json +"dataset_name": { + "hf_hub_url": "the name of the dataset repository on the Hugging Face hub. (if specified, ignore script_url, file_name and cloud_file_name)", + "ms_hub_url": "the name of the dataset repository on the Model Scope hub. (if specified, ignore script_url, file_name and cloud_file_name)", + "script_url": "the name of the directory containing a dataset loading script. (if specified, ignore file_name and cloud_file_name)", + "cloud_file_name": "the name of the dataset file in s3/gcs cloud storage. (if specified, ignore file_name)", + "file_name": "the name of the dataset folder or dataset file in this directory. (required if above are not specified)", + "formatting": "the format of the dataset. (optional, default: alpaca, can be chosen from {alpaca, sharegpt})", + "ranking": "whether the dataset is a preference dataset or not. (default: False)", + "subset": "the name of the subset. (optional, default: None)", + "split": "the name of dataset split to be used. (optional, default: train)", + "folder": "the name of the folder of the dataset repository on the Hugging Face hub. (optional, default: None)", + "num_samples": "the number of samples in the dataset to be used. (optional, default: None)", + "columns (optional)": { + "prompt": "the column name in the dataset containing the prompts. (default: instruction)", + "query": "the column name in the dataset containing the queries. (default: input)", + "response": "the column name in the dataset containing the responses. (default: output)", + "history": "the column name in the dataset containing the histories. (default: None)", + "messages": "the column name in the dataset containing the messages. (default: conversations)", + "system": "the column name in the dataset containing the system prompts. (default: None)", + "tools": "the column name in the dataset containing the tool description. (default: None)", + "images": "the column name in the dataset containing the image inputs. 
(default: None)", + "videos": "the column name in the dataset containing the videos inputs. (default: None)", + "audios": "the column name in the dataset containing the audios inputs. (default: None)", + "chosen": "the column name in the dataset containing the chosen answers. (default: None)", + "rejected": "the column name in the dataset containing the rejected answers. (default: None)", + "kto_tag": "the column name in the dataset containing the kto tags. (default: None)" + }, + "tags (optional, used for the sharegpt format)": { + "role_tag": "the key in the message represents the identity. (default: from)", + "content_tag": "the key in the message represents the content. (default: value)", + "user_tag": "the value of the role_tag represents the user. (default: human)", + "assistant_tag": "the value of the role_tag represents the assistant. (default: gpt)", + "observation_tag": "the value of the role_tag represents the tool results. (default: observation)", + "function_tag": "the value of the role_tag represents the function call. (default: function_call)", + "system_tag": "the value of the role_tag represents the system prompt. (default: system, can override system column)" + } +} +``` + +## Alpaca Format + +### Supervised Fine-Tuning Dataset + +* [Example dataset](alpaca_en_demo.json) + +In supervised fine-tuning, the `instruction` column will be concatenated with the `input` column and used as the user prompt, then the user prompt would be `instruction\ninput`. The `output` column represents the model response. + +For reasoning models, if the dataset contains chain-of-thought (CoT), the CoT needs to be placed in the model responses, such as `cotoutput`. + +The `system` column will be used as the system prompt if specified. + +The `history` column is a list consisting of string tuples representing prompt-response pairs in the history messages. Note that the responses in the history **will also be learned by the model** in supervised fine-tuning. 
+ +```json +[ + { + "instruction": "user instruction (required)", + "input": "user input (optional)", + "output": "model response (required)", + "system": "system prompt (optional)", + "history": [ + ["user instruction in the first round (optional)", "model response in the first round (optional)"], + ["user instruction in the second round (optional)", "model response in the second round (optional)"] + ] + } +] +``` + +Regarding the above dataset, the *dataset description* in `dataset_info.json` should be: + +```json +"dataset_name": { + "file_name": "data.json", + "columns": { + "prompt": "instruction", + "query": "input", + "response": "output", + "system": "system", + "history": "history" + } +} +``` + +> [!TIP] +> If the model has reasoning capabilities (e.g. Qwen3) but the dataset does not contain chain-of-thought (CoT), LLaMA-Factory will automatically add empty CoT to the data. When `enable_thinking` is `True` (slow thinking, by default), the empty CoT will be added to the model responses and loss computation will be considered; otherwise (fast thinking), it will be added to the user prompts and loss computation will be ignored. Please keep the `enable_thinking` parameter consistent during training and inference. +> +> If you want to train data containing CoT with slow thinking and data without CoT with fast thinking, you can set `enable_thinking` to `None`. However, this feature is relatively complicated and should be used with caution. + +### Pre-training Dataset + +- [Example dataset](c4_demo.jsonl) + +In pre-training, only the `text` column will be used for model learning. + +```json +[ + {"text": "document"}, + {"text": "document"} +] +``` + +Regarding the above dataset, the *dataset description* in `dataset_info.json` should be: + +```json +"dataset_name": { + "file_name": "data.json", + "columns": { + "prompt": "text" + } +} +``` + +### Preference Dataset + +Preference datasets are used for reward modeling, DPO training, ORPO and SimPO training. 
+ +It requires a better response in `chosen` column and a worse response in `rejected` column. + +```json +[ + { + "instruction": "user instruction (required)", + "input": "user input (optional)", + "chosen": "chosen answer (required)", + "rejected": "rejected answer (required)" + } +] +``` + +Regarding the above dataset, the *dataset description* in `dataset_info.json` should be: + +```json +"dataset_name": { + "file_name": "data.json", + "ranking": true, + "columns": { + "prompt": "instruction", + "query": "input", + "chosen": "chosen", + "rejected": "rejected" + } +} +``` + +### KTO Dataset + +An additional column `kto_tag` is required. Please refer to the [sharegpt](#sharegpt-format) format for details. + +### Multimodal Image Dataset + +An additional column `images` is required. Please refer to the [sharegpt](#sharegpt-format) format for details. + +### Multimodal Video Dataset + +An additional column `videos` is required. Please refer to the [sharegpt](#sharegpt-format) format for details. + +### Multimodal Audio Dataset + +An additional column `audios` is required. Please refer to the [sharegpt](#sharegpt-format) format for details. + +## Sharegpt Format + +### Supervised Fine-Tuning Dataset + +- [Example dataset](glaive_toolcall_en_demo.json) + +Compared to the alpaca format, the sharegpt format allows the datasets have **more roles**, such as human, gpt, observation and function. They are presented in a list of objects in the `conversations` column. + +Note that the human and observation should appear in odd positions, while gpt and function should appear in even positions. 
+ +```json +[ + { + "conversations": [ + { + "from": "human", + "value": "user instruction" + }, + { + "from": "function_call", + "value": "tool arguments" + }, + { + "from": "observation", + "value": "tool result" + }, + { + "from": "gpt", + "value": "model response" + } + ], + "system": "system prompt (optional)", + "tools": "tool description (optional)" + } +] +``` + +Regarding the above dataset, the *dataset description* in `dataset_info.json` should be: + +```json +"dataset_name": { + "file_name": "data.json", + "formatting": "sharegpt", + "columns": { + "messages": "conversations", + "system": "system", + "tools": "tools" + } +} +``` + +### Pre-training Dataset + +Not yet supported, please use the [alpaca](#alpaca-format) format. + +### Preference Dataset + +- [Example dataset](dpo_en_demo.json) + +Preference datasets in sharegpt format also require a better message in `chosen` column and a worse message in `rejected` column. + +```json +[ + { + "conversations": [ + { + "from": "human", + "value": "user instruction" + }, + { + "from": "gpt", + "value": "model response" + }, + { + "from": "human", + "value": "user instruction" + } + ], + "chosen": { + "from": "gpt", + "value": "chosen answer (required)" + }, + "rejected": { + "from": "gpt", + "value": "rejected answer (required)" + } + } +] +``` + +Regarding the above dataset, the *dataset description* in `dataset_info.json` should be: + +```json +"dataset_name": { + "file_name": "data.json", + "formatting": "sharegpt", + "ranking": true, + "columns": { + "messages": "conversations", + "chosen": "chosen", + "rejected": "rejected" + } +} +``` + +### KTO Dataset + +- [Example dataset](kto_en_demo.json) + +KTO datasets require a extra `kto_tag` column containing the boolean human feedback. 
+ +```json +[ + { + "conversations": [ + { + "from": "human", + "value": "user instruction" + }, + { + "from": "gpt", + "value": "model response" + } + ], + "kto_tag": "human feedback [true/false] (required)" + } +] +``` + +Regarding the above dataset, the *dataset description* in `dataset_info.json` should be: + +```json +"dataset_name": { + "file_name": "data.json", + "formatting": "sharegpt", + "columns": { + "messages": "conversations", + "kto_tag": "kto_tag" + } +} +``` + +### Multimodal Image Dataset + +- [Example dataset](mllm_demo.json) + +Multimodal image datasets require an `images` column containing the paths to the input images. + +The number of images should be identical to the `` tokens in the conversations. + +```json +[ + { + "conversations": [ + { + "from": "human", + "value": "user instruction" + }, + { + "from": "gpt", + "value": "model response" + } + ], + "images": [ + "image path (required)" + ] + } +] +``` + +Regarding the above dataset, the *dataset description* in `dataset_info.json` should be: + +```json +"dataset_name": { + "file_name": "data.json", + "formatting": "sharegpt", + "columns": { + "messages": "conversations", + "images": "images" + } +} +``` + +### Multimodal Video Dataset + +- [Example dataset](mllm_video_demo.json) + +Multimodal video datasets require a `videos` column containing the paths to the input videos. + +The number of videos should be identical to the `