ekurtic committed on
Commit 70371b1 · verified · 1 Parent(s): 3921b6a

Update README.md

Files changed (1):
  1. README.md +17 -17
README.md CHANGED
@@ -84,7 +84,7 @@ This model can be deployed efficiently using the [vLLM](https://docs.vllm.ai/en/
 from vllm import LLM, SamplingParams
 from transformers import AutoTokenizer
 
-model_id = "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16"
+model_id = "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16"
 number_gpus = 1
 max_model_len = 8192
 
@@ -614,7 +614,7 @@ The results were obtained using the following commands:
 ```
 lm_eval \
   --model vllm \
-  --model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
+  --model_args pretrained="RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
   --tasks mmlu_llama_3.1_instruct \
   --fewshot_as_multiturn \
   --apply_chat_template \
@@ -626,7 +626,7 @@ lm_eval \
 ```
 lm_eval \
   --model vllm \
-  --model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16",dtype=auto,max_model_len=4064,max_gen_toks=1024,tensor_parallel_size=1 \
+  --model_args pretrained="RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16",dtype=auto,max_model_len=4064,max_gen_toks=1024,tensor_parallel_size=1 \
   --tasks mmlu_cot_0shot_llama_3.1_instruct \
   --apply_chat_template \
   --num_fewshot 0 \
@@ -637,7 +637,7 @@ lm_eval \
 ```
 lm_eval \
   --model vllm \
-  --model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16",dtype=auto,max_model_len=3940,max_gen_toks=100,tensor_parallel_size=1 \
+  --model_args pretrained="RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16",dtype=auto,max_model_len=3940,max_gen_toks=100,tensor_parallel_size=1 \
   --tasks arc_challenge_llama_3.1_instruct \
   --apply_chat_template \
   --num_fewshot 0 \
@@ -648,7 +648,7 @@ lm_eval \
 ```
 lm_eval \
   --model vllm \
-  --model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16",dtype=auto,max_model_len=4096,max_gen_toks=1024,tensor_parallel_size=1 \
+  --model_args pretrained="RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16",dtype=auto,max_model_len=4096,max_gen_toks=1024,tensor_parallel_size=1 \
   --tasks gsm8k_cot_llama_3.1_instruct \
   --fewshot_as_multiturn \
   --apply_chat_template \
@@ -660,7 +660,7 @@ lm_eval \
 ```
 lm_eval \
   --model vllm \
-  --model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16",dtype=auto,add_bos_token=True,max_model_len=4096,tensor_parallel_size=1 \
+  --model_args pretrained="RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16",dtype=auto,add_bos_token=True,max_model_len=4096,tensor_parallel_size=1 \
   --tasks hellaswag \
   --num_fewshot 10 \
   --batch_size auto
@@ -670,7 +670,7 @@ lm_eval \
 ```
 lm_eval \
   --model vllm \
-  --model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16",dtype=auto,add_bos_token=True,max_model_len=4096,tensor_parallel_size=1 \
+  --model_args pretrained="RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16",dtype=auto,add_bos_token=True,max_model_len=4096,tensor_parallel_size=1 \
   --tasks winogrande \
   --num_fewshot 5 \
   --batch_size auto
@@ -680,7 +680,7 @@ lm_eval \
 ```
 lm_eval \
   --model vllm \
-  --model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16",dtype=auto,add_bos_token=True,max_model_len=4096,tensor_parallel_size=1 \
+  --model_args pretrained="RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16",dtype=auto,add_bos_token=True,max_model_len=4096,tensor_parallel_size=1 \
   --tasks truthfulqa \
   --num_fewshot 0 \
   --batch_size auto
@@ -690,7 +690,7 @@ lm_eval \
 ```
 lm_eval \
   --model vllm \
-  --model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16",dtype=auto,max_model_len=4096,tensor_parallel_size=1,enable_chunked_prefill=True \
+  --model_args pretrained="RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16",dtype=auto,max_model_len=4096,tensor_parallel_size=1,enable_chunked_prefill=True \
   --apply_chat_template \
   --fewshot_as_multiturn \
   --tasks leaderboard \
@@ -701,7 +701,7 @@ lm_eval \
 ```
 lm_eval \
   --model vllm \
-  --model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
+  --model_args pretrained="RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
   --tasks mmlu_pt_llama_3.1_instruct \
   --fewshot_as_multiturn \
   --apply_chat_template \
@@ -713,7 +713,7 @@ lm_eval \
 ```
 lm_eval \
   --model vllm \
-  --model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
+  --model_args pretrained="RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
   --tasks mmlu_es_llama_3.1_instruct \
   --fewshot_as_multiturn \
   --apply_chat_template \
@@ -725,7 +725,7 @@ lm_eval \
 ```
 lm_eval \
   --model vllm \
-  --model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
+  --model_args pretrained="RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
   --tasks mmlu_it_llama_3.1_instruct \
   --fewshot_as_multiturn \
   --apply_chat_template \
@@ -737,7 +737,7 @@ lm_eval \
 ```
 lm_eval \
   --model vllm \
-  --model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
+  --model_args pretrained="RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
   --tasks mmlu_de_llama_3.1_instruct \
   --fewshot_as_multiturn \
   --apply_chat_template \
@@ -749,7 +749,7 @@ lm_eval \
 ```
 lm_eval \
   --model vllm \
-  --model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
+  --model_args pretrained="RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
   --tasks mmlu_fr_llama_3.1_instruct \
   --fewshot_as_multiturn \
   --apply_chat_template \
@@ -761,7 +761,7 @@ lm_eval \
 ```
 lm_eval \
   --model vllm \
-  --model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
+  --model_args pretrained="RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
   --tasks mmlu_hi_llama_3.1_instruct \
   --fewshot_as_multiturn \
   --apply_chat_template \
@@ -773,7 +773,7 @@ lm_eval \
 ```
 lm_eval \
   --model vllm \
-  --model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
+  --model_args pretrained="RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
   --tasks mmlu_th_llama_3.1_instruct \
   --fewshot_as_multiturn \
   --apply_chat_template \
@@ -785,7 +785,7 @@ lm_eval \
 ##### Generation
 ```
 python3 codegen/generate.py \
-  --model neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16 \
+  --model RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16 \
   --bs 16 \
   --temperature 0.2 \
   --n_samples 50 \
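
For reference, the README's deployment snippet with the renamed model id reads as below. This is a minimal runnable sketch assuming vLLM and transformers are installed; the `messages` content and the `SamplingParams` settings are illustrative placeholders, not part of the diff.

```
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer

# Renamed model id introduced by this commit
model_id = "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16"
number_gpus = 1
max_model_len = 8192

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Illustrative sampling settings (not taken from the diff)
sampling_params = SamplingParams(temperature=0.6, top_p=0.9, max_tokens=256)

# Illustrative prompt
messages = [{"role": "user", "content": "Give me a short introduction to large language models."}]

# Render the chat template to a plain-text prompt for vLLM
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

llm = LLM(model=model_id, tensor_parallel_size=number_gpus, max_model_len=max_model_len)

outputs = llm.generate([prompt], sampling_params)
print(outputs[0].outputs[0].text)
```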