| | |
| | |
| | |
| | |
| | @@ -175,6 +175,7 @@ debug.py |
| | wandb/ |
| | nohup.out |
| | lm-evaluation-harness/ |
| | +bigcode-evaluation-harness/ |
| | results/**/*.json |
| | results/**/*.jsonl |
| | results/**/*.db |
| | |
| | |
| | |
| | |
| | @@ -26,6 +26,11 @@ bash scripts/data.sh |
| | git clone https://github.com/EleutherAI/lm-evaluation-harness.git |
| | cd lm-evaluation-harness |
| | pip install -e . |
| | +# commit: 9cfa52b |
| | +git clone https://github.com/bigcode-project/bigcode-evaluation-harness.git |
| | +cd bigcode-evaluation-harness |
| | +# before installing, set the `pyext` pin to `pyext==0.5` in `bigcode-evaluation-harness/requirements.txt`, ref: https://github.com/bigcode-project/bigcode-evaluation-harness/pull/181 |
| | +pip install -e . |
| | ``` |
| | |
| | ## 📃 TODO |
| | |
| | deleted file mode 100644 |
| | |
| | |
| | |
| | @@ -1,96 +0,0 @@ |
| | -# nohup srun -p MoE --gres gpu:1 bash scripts/eval.sh all /mnt/petrelfs/share_data/quxiaoye/models/Sheared-LLaMA-2.7B True results/Sheared-LLaMA-2.7B 1>logs/eval-all-Sheared-LLaMA-2.7B.log 2>&1 & |
| | - |
| | -mmlu() { |
| | - # MMLU: https://github.com/princeton-nlp/LLM-Shearing/blob/20ebd2645a8ff5fa65874e1347f9891b80e01805/icl_eval/run_eval.sh#L18 |
| | - MODEL=$1 |
| | - TRUST_REMOTE_CODE=$2 |
| | - RESULT_DIR=$3 |
| | - mkdir -p $RESULT_DIR |
| | - |
| | - lm_eval \ |
| | - --model hf \ |
| | - --model_args pretrained=$MODEL,trust_remote_code=$TRUST_REMOTE_CODE \ |
| | - --tasks mmlu_computer_security,mmlu_high_school_chemistry,mmlu_philosophy,mmlu_elementary_mathematics,mmlu_prehistory,mmlu_formal_logic,mmlu_high_school_mathematics,mmlu_econometrics,mmlu_moral_scenarios,mmlu_college_mathematics,mmlu_high_school_government_and_politics,mmlu_us_foreign_policy,mmlu_high_school_world_history,mmlu_conceptual_physics,mmlu_college_medicine,mmlu_international_law,mmlu_abstract_algebra,mmlu_logical_fallacies,mmlu_machine_learning,mmlu_medical_genetics,mmlu_public_relations,mmlu_college_biology,mmlu_marketing,mmlu_electrical_engineering,mmlu_anatomy,mmlu_high_school_us_history,mmlu_high_school_biology,mmlu_miscellaneous,mmlu_high_school_psychology,mmlu_sociology,mmlu_business_ethics,mmlu_high_school_geography,mmlu_human_aging,mmlu_high_school_statistics,mmlu_moral_disputes,mmlu_professional_psychology,mmlu_global_facts,mmlu_college_physics,mmlu_nutrition,mmlu_high_school_macroeconomics,mmlu_world_religions,mmlu_professional_medicine,mmlu_high_school_computer_science,mmlu_college_chemistry,mmlu_human_sexuality,mmlu_high_school_microeconomics,mmlu_astronomy,mmlu_professional_accounting,mmlu_high_school_european_history,mmlu_jurisprudence,mmlu_professional_law,mmlu_high_school_physics,mmlu_virology,mmlu_management,mmlu_college_computer_science,mmlu_clinical_knowledge,mmlu_security_studies \ |
| | - --num_fewshot 5 \ |
| | - --device cuda:0 \ |
| | - --batch_size auto \ |
| | - --verbosity DEBUG \ |
| | - --output_path $RESULT_DIR/mmlu.json |
| | -} |
| | - |
| | -bbh() { |
| | - # Big Bench Hard (BBH): https://arxiv.org/pdf/2210.09261.pdf |
| | - MODEL=$1 |
| | - TRUST_REMOTE_CODE=$2 |
| | - RESULT_DIR=$3 |
| | - mkdir -p $RESULT_DIR |
| | - |
| | - lm_eval \ |
| | - --log_samples \ |
| | - --model hf \ |
| | - --model_args pretrained=$MODEL,trust_remote_code=$TRUST_REMOTE_CODE \ |
| | - --tasks bbh_fewshot_boolean_expressions,bbh_fewshot_causal_judgement,bbh_fewshot_date_understanding,bbh_fewshot_disambiguation_qa,bbh_fewshot_dyck_languages,bbh_fewshot_formal_fallacies,bbh_fewshot_geometric_shapes,bbh_fewshot_hyperbaton,bbh_fewshot_logical_deduction_five_objects,bbh_fewshot_logical_deduction_seven_objects,bbh_fewshot_logical_deduction_three_objects,bbh_fewshot_movie_recommendation,bbh_fewshot_multistep_arithmetic_two,bbh_fewshot_navigate,bbh_fewshot_object_counting,bbh_fewshot_penguins_in_a_table,bbh_fewshot_reasoning_about_colored_objects,bbh_fewshot_ruin_names,bbh_fewshot_salient_translation_error_detection,bbh_fewshot_snarks,bbh_fewshot_sports_understanding,bbh_fewshot_temporal_sequences,bbh_fewshot_tracking_shuffled_objects_five_objects,bbh_fewshot_tracking_shuffled_objects_seven_objects,bbh_fewshot_tracking_shuffled_objects_three_objects,bbh_fewshot_web_of_lies,bbh_fewshot_word_sorting \ |
| | - --device cuda:0 \ |
| | - --batch_size auto \ |
| | - --verbosity DEBUG \ |
| | - --output_path $RESULT_DIR/bbh.json |
| | -} |
| | - |
| | -reasoning() { |
| | - MODEL=$1 |
| | - TRUST_REMOTE_CODE=$2 |
| | - RESULT_DIR=$3 |
| | - mkdir -p $RESULT_DIR |
| | - |
| | - lm_eval \ |
| | - --log_samples \ |
| | - --model hf \ |
| | - --model_args pretrained=$MODEL,trust_remote_code=$TRUST_REMOTE_CODE \ |
| | - --tasks gsm8k_cot \ |
| | - --device cuda:0 \ |
| | - --batch_size auto \ |
| | - --verbosity DEBUG \ |
| | - --output_path $RESULT_DIR/reasoning.json |
| | -} |
| | - |
| | -qa() { |
| | - MODEL=$1 |
| | - TRUST_REMOTE_CODE=$2 |
| | - RESULT_DIR=$3 |
| | - mkdir -p $RESULT_DIR |
| | - |
| | - lm_eval \ |
| | - --log_samples \ |
| | - --model hf \ |
| | - --model_args pretrained=$MODEL,trust_remote_code=$TRUST_REMOTE_CODE \ |
| | - --tasks arc_easy,arc_challenge,boolq \ |
| | - --num_fewshot 0 \ |
| | - --device cuda:0 \ |
| | - --batch_size auto \ |
| | - --verbosity DEBUG \ |
| | - --output_path $RESULT_DIR/qa.json |
| | -} |
| | - |
| | -EVAL_TASK=$1 |
| | -shift 1 |
| | -start=$(date +%s) |
| | -case $EVAL_TASK in |
| | - mmlu) |
| | - mmlu $* ;; |
| | - bbh) |
| | - bbh $* ;; |
| | - reasoning) |
| | - reasoning $* ;; |
| | - qa) |
| | - qa $* ;; |
| | - all) |
| | - mmlu $* |
| | - bbh $* |
| | - reasoning $* |
| | - qa $* |
| | - ;; |
| | - *) |
| | - echo "$EVAL_TASK not recognized!";; |
| | -esac |
| | -end=$(date +%s) |
| | -echo "Elapsed Time: $(($end-$start)) seconds" |
| | |
| | |
| | |
| | |
| | @@ -83,8 +83,11 @@ num_gpus=4 |
| | |
| | python -m src.eval.gen_mt_ans \ |
| | --model-path $output_dir \ |
| | - --model-id $task_name \ |
| | - --num-gpus-total $num_gpus |
| | + --model-id $task_name |
| | + |
| | + python -m src.eval.gen_alpaca_eval_ans \ |
| | + --model-path $output_dir \ |
| | + --model-id $task_name |
| | } |
| | |
| | # nohup srun -p MoE --ntasks-per-node=1 --cpus-per-task=16 --mem=128G --nodes=1 --gres=gpu:4 bash "/mnt/petrelfs/zhutong/adaptive-sft-for-moe/scripts/one_data_steps_dynamic.sh" "llama_moe_orca_epochs_cluster_4" "auto" "/mnt/petrelfs/zhutong/llama-moe-models/LLaMA-MoE-v1-3_5B-2_8-new" "data/open_orca_clustered/4" "data/open_orca_clustered_eval/4" 1>logs/llama_moe_orca_cluster_4_dynamic.log 2>&1 & |
| | |
| | deleted file mode 100644 |
| | |
| | |
| | |
| | @@ -1,32 +0,0 @@ |
| | -#!/usr/bin/bash |
| | - |
| | -#SBATCH --job-name=moe_gen |
| | -#SBATCH --output=logs/%x-%j.log |
| | -#SBATCH --error=logs/%x-%j.log |
| | - |
| | -#SBATCH --partition=MoE |
| | -#SBATCH --ntasks-per-node=1 |
| | -#SBATCH --cpus-per-task=16 |
| | -#SBATCH --mem=64G |
| | - |
| | -#SBATCH --nodes=1 |
| | -#SBATCH --gres=gpu:1 |
| | -#SBATCH --quotatype=auto |
| | - |
| | -{ |
| | - # python -m fastchat.llm_judge.gen_model_answer \ |
| | - # --model-path outputs/sheared_llama_sharegpt/moe_sft-2411306 \ |
| | - # --model-id sheared_llama_sharegpt |
| | - |
| | - # python -m fastchat.llm_judge.gen_model_answer \ |
| | - # --model-path outputs/sheared_llama_uniform_mix/moe_sft-2421072 \ |
| | - # --model-id sheared_llama_uniform_mix |
| | - |
| | - bash scripts/cp_model_files.sh outputs/llama_moe/moe_sft-2409782 |
| | - python -m fastchat.llm_judge.gen_model_answer \ |
| | - --model-path outputs/llama_moe/moe_sft-2409782 \ |
| | - --model-id llama_moe_uniform_mix |
| | -} |
| | - |
| | -# nohup srun -p MoE -n1 -N1 --gres=gpu:1 --quotatype spot python -m fastchat.llm_judge.gen_model_answer --model-path outputs/sheared_llama_sharegpt/moe_sft-2411306 --model-id sheared_llama_sharegpt 1>logs/mt_bench_gen_sheared_llama_sharegpt.log 2>&1 & |
| | -# nohup srun -p MoE -n1 -N1 --gres=gpu:1 --quotatype spot python -m fastchat.llm_judge.gen_model_answer --model-path /mnt/petrelfs/zhutong/adaptive-sft-for-moe/outputs/llama_moe_sharegpt/moe_sft-2411309 --model-id llama_moe_sharegpt 1>logs/mt_bench_gen_llama_moe_sharegpt.log 2>&1 & |
| | |
| | |
| | |
| | |
| | @@ -100,5 +100,8 @@ nohup srun -p MoE --ntasks-per-node=1 --cpus-per-task=16 --mem=128G --nodes=1 -- |
| | nohup srun -p MoE --gres gpu:1 python -m src.eval.gen_mt_ans --model-path /mnt/petrelfs/zhutong/adaptive-sft-for-moe/outputs/len2048/llama_moe_four_mix_uniform/bash-2485396 --model-id llama_moe_four_mix_uniform 1>logs/gen_mt_ans-llama_moe_four_mix_uniform.log 2>&1 & |
| | nohup srun -p MoE --gres gpu:1 python -m src.eval.gen_mt_ans --model-path /mnt/petrelfs/zhutong/adaptive-sft-for-moe/outputs/len2048/sheared_four_mix_uniform/bash-2485397 --model-id sheared_four_mix_uniform 1>logs/gen_mt_ans-sheared_four_mix_uniform.log 2>&1 & |
| | |
| | -nohup srun -p MoE --gres gpu:1 python -m src.eval.get_alpaca_eval_ans --model-path /mnt/petrelfs/zhutong/adaptive-sft-for-moe/outputs/len2048/llama_moe_four_mix_uniform/bash-2485396 --model-id llama_moe_four_mix_uniform 1>logs/gen_alpaca_eval-llama_moe_four_mix_uniform.log 2>&1 & |
| | -nohup srun -p MoE --gres gpu:1 python -m src.eval.get_alpaca_eval_ans --model-path /mnt/petrelfs/zhutong/adaptive-sft-for-moe/outputs/len2048/sheared_four_mix_uniform/bash-2485397 --model-id sheared_four_mix_uniform 1>logs/gen_alpaca_eval-sheared_four_mix_uniform.log 2>&1 & |
| | +nohup srun -p MoE --gres gpu:1 python -m src.eval.gen_alpaca_eval_ans --model-path /mnt/petrelfs/zhutong/adaptive-sft-for-moe/outputs/len2048/llama_moe_four_mix_uniform/bash-2485396 --model-id llama_moe_four_mix_uniform 1>logs/gen_alpaca_eval-llama_moe_four_mix_uniform.log 2>&1 & |
| | +nohup srun -p MoE --gres gpu:1 python -m src.eval.gen_alpaca_eval_ans --model-path /mnt/petrelfs/zhutong/adaptive-sft-for-moe/outputs/len2048/sheared_four_mix_uniform/bash-2485397 --model-id sheared_four_mix_uniform 1>logs/gen_alpaca_eval-sheared_four_mix_uniform.log 2>&1 & |
| | + |
| | +nohup srun -p MoE --gres gpu:1 bash scripts/eval/eval.sh reasoning /mnt/petrelfs/zhutong/adaptive-sft-for-moe/outputs/len2048_dynamic_remove_padding_tokens/llama_moe_four_mix_wo_pad_wo_gate_noise/moe_sft-2492650 True results/llama_moe_four_mix_wo_pad_wo_gate_noise 1>logs/eval-reasoning-llama_moe_four_mix_wo_pad_wo_gate_noise.log 2>&1 & |
| | +nohup srun -p MoE --gres gpu:1 bash scripts/eval/eval.sh reasoning /mnt/petrelfs/zhutong/adaptive-sft-for-moe/outputs/len2048_dynamic_remove_padding_tokens/llama_moe_four_mix_wo_pad/moe_sft-2491633 True results/llama_moe_four_mix_wo_pad 1>logs/eval-reasoning-llama_moe_four_mix_wo_pad.log 2>&1 & |
| | |
| | |
| | |
| | |
| | @@ -6,6 +6,7 @@ import torch |
| | import numpy as np |
| | from loguru import logger |
| | from transformers.trainer_callback import TrainerCallback, TrainerState, TrainerControl |
| | +from transformers.utils import is_flash_attn_2_available |
| | |
| | from src.utils.config import TrainingArguments |
| | from src.utils.io import append_jsonlines |
| | @@ -22,6 +23,7 @@ class AdaptiveSamplingCallback(TrainerCallback): |
| | criterion: Optional[Literal["min", "max", "mean"]] = "mean", |
| | sim_type: Optional[Literal["cos", "l2"]] = "cos", |
| | ): |
| | + assert is_flash_attn_2_available(), "Make sure you have flash-attn installed" |
| | self.criterion = criterion |
| | self.sim_type = sim_type |
| | self.prob_map = {} |
| | |
| | |
| | |
| | |
| | @@ -117,7 +117,9 @@ def train(): |
| | train_dataset = SubDirWeightedPackedJsonlDataset( |
| | data_args.dataset_dir_or_path, |
| | tokenizer, |
| | - prob_map=get_uniform_sampling_ratio(data_args.dataset_dir_or_path), |
| | + # prob_map=get_uniform_sampling_ratio(data_args.dataset_dir_or_path), |
| | + # prob_map={"code": 0.25119094959816823, "math": 0.2674581878910902, "orca": 0.243050776175138, "sharegpt": 0.23830008633560357}, |
| | + prob_map=data_args.prob_map, |
| | seed=training_args.seed, |
| | ) |
| | elif datapath.is_file(): |
| | |
| | deleted file mode 100644 |
| | |
| | |
| | |
| | @@ -1,113 +0,0 @@ |
| | -import argparse |
| | -from pathlib import Path |
| | - |
| | -import torch |
| | -import datasets |
| | -from tqdm import tqdm |
| | - |
| | -from src.core.train import get_model_and_tokenizer |
| | -from src.utils.conversation import Conversation |
| | -from src.utils.io import dump_json |
| | - |
| | - |
| | -@torch.inference_mode() |
| | -def run_eval(model_path, model_id, max_new_tokens): |
| | - model, tokenizer = get_model_and_tokenizer( |
| | - "auto", |
| | - model_path, |
| | - torch_dtype=torch.bfloat16, |
| | - trust_remote_code=True, |
| | - ) |
| | - model.cuda() |
| | - model.eval() |
| | - |
| | - conv = Conversation() |
| | - outputs = [] |
| | - eval_set = datasets.load_dataset("tatsu-lab/alpaca_eval", "alpaca_eval")["eval"] |
| | - for example in tqdm(eval_set, desc="Eval"): |
| | - conv.append_message(conv.roles[0], example["instruction"]) |
| | - conv.append_message(conv.roles[1], None) |
| | - prompt = conv.get_prompt() |
| | - input_ids = tokenizer([prompt], return_tensors="pt").input_ids |
| | - conv.clear_msg() |
| | - # generate here is a placeholder for your model's generations |
| | - output_ids = model.generate( |
| | - input_ids.cuda(), |
| | - do_sample=False, |
| | - temperature=0.0, |
| | - max_new_tokens=max_new_tokens, |
| | - ) |
| | - if model.config.is_encoder_decoder: |
| | - output_ids = output_ids[0] |
| | - else: |
| | - output_ids = output_ids[0][len(input_ids[0]) :] # noqa: E203 |
| | - # be consistent with the template's stop_token_ids |
| | - if conv.stop_token_ids: |
| | - stop_token_ids_index = [ |
| | - i |
| | - for i, id in enumerate(output_ids) |
| | - if id in conv.stop_token_ids |
| | - ] |
| | - if len(stop_token_ids_index) > 0: |
| | - output_ids = output_ids[: stop_token_ids_index[0]] |
| | - |
| | - output = tokenizer.decode( |
| | - output_ids, |
| | - spaces_between_special_tokens=False, |
| | - ) |
| | - if conv.stop_str and isinstance(conv.stop_str, list): |
| | - stop_str_indices = sorted( |
| | - [ |
| | - output.find(stop_str) |
| | - for stop_str in conv.stop_str |
| | - if output.find(stop_str) > 0 |
| | - ] |
| | - ) |
| | - if len(stop_str_indices) > 0: |
| | - output = output[: stop_str_indices[0]] |
| | - elif conv.stop_str and output.find(conv.stop_str) > 0: |
| | - output = output[: output.find(conv.stop_str)] |
| | - |
| | - for special_token in tokenizer.special_tokens_map.values(): |
| | - if isinstance(special_token, list): |
| | - for special_tok in special_token: |
| | - output = output.replace(special_tok, "") |
| | - else: |
| | - output = output.replace(special_token, "") |
| | - output = output.strip() |
| | - |
| | - if conv.name == "xgen" and output.startswith("Assistant:"): |
| | - output = output.replace("Assistant:", "", 1).strip() |
| | - |
| | - example["output"] = output |
| | - outputs.append(example) |
| | - |
| | - outpath = Path("results/alpaca_eval") / f"{model_id}.json" |
| | - dump_json(outputs, outpath, indent=2) |
| | - |
| | - |
| | -if __name__ == "__main__": |
| | - parser = argparse.ArgumentParser() |
| | - parser.add_argument( |
| | - "--model-path", |
| | - type=str, |
| | - required=True, |
| | - help="The path to the weights. This can be a local folder or a Hugging Face repo ID.", |
| | - ) |
| | - parser.add_argument( |
| | - "--model-id", type=str, required=True, help="A custom name for the model." |
| | - ) |
| | - parser.add_argument( |
| | - "--max-new-token", |
| | - type=int, |
| | - default=1024, |
| | - help="The maximum number of new generated tokens.", |
| | - ) |
| | - |
| | - args = parser.parse_args() |
| | - |
| | - run_eval( |
| | - model_path=args.model_path, |
| | - model_id=args.model_id, |
| | - max_new_tokens=args.max_new_token, |
| | - ) |
| | |
| | |
| | |
| | |
| | @@ -6,6 +6,7 @@ import torch |
| | import transformers |
| | |
| | from src.utils.io import load_json |
| | +from src.data import get_uniform_sampling_ratio |
| | |
| | |
| | @dataclass |
| | @@ -33,7 +34,9 @@ class ModelArguments: |
| | ) |
| | attn_impl: str = field( |
| | default="flash_attention_2", |
| | - metadata={"help": "attention implementation, choice from [eager, flash_attention_2, sdpa] (default: `flash_attention_2`)"} |
| | + metadata={ |
| | + "help": "attention implementation, choice from [eager, flash_attention_2, sdpa] (default: `flash_attention_2`)" |
| | + }, |
| | ) |
| | |
| | def __post_init__(self): |
| | @@ -56,6 +59,18 @@ class DataArguments: |
| | default="data/merged", |
| | metadata={"help": "Path to dataset directory or a single jsonl file"}, |
| | ) |
| | + prob_map: str = field( |
| | + default=None, |
| | + metadata={"help": "Path to the probability map file"}, |
| | + ) |
| | + |
| | + def __post_init__(self): |
| | + if self.prob_map is not None: |
| | + if not pathlib.Path(self.prob_map).exists(): |
| | + raise ValueError(f"Probability map file {self.prob_map} not found") |
| | + self.prob_map = load_json(self.prob_map) |
| | + else: |
| | + self.prob_map = get_uniform_sampling_ratio(self.dataset_dir_or_path) |
| | |
| | |
| | @dataclass |
| | @@ -70,9 +85,7 @@ class TrainingArguments(transformers.TrainingArguments): |
| | ) |
| | max_eval_steps_per_type: int = field( |
| | default=10, |
| | - metadata={ |
| | - "help": "Maximum number of steps to perform during evaluation." |
| | - }, |
| | + metadata={"help": "Maximum number of steps to perform during evaluation."}, |
| | ) |
| | dynamic_sampling_sim_type: Literal["cos", "l2"] = field( |
| | default="l2", |
| | @@ -88,7 +101,5 @@ class TrainingArguments(transformers.TrainingArguments): |
| | ) |
| | freeze_gate: bool = field( |
| | default=False, |
| | - metadata={ |
| | - "help": "Whether to freeze the gate during training." |
| | - }, |
| | + metadata={"help": "Whether to freeze the gate during training."}, |
| | ) |
| |
|