#!/usr/bin/env bash
#
# Full Korean-LLM benchmark evaluation driver.
#
# Converts a training checkpoint to HF format (if needed), runs the lm-eval
# benchmark suite on it, and renders a markdown summary report.
#
# Usage:
#   run_full_eval.sh [CHECKPOINT_DIR] [OUTPUT_DIR]
#
# Environment overrides (see below): USE_MULTI_GPU, CUDA_VISIBLE_DEVICES,
# BATCH_SIZE, NUM_FEWSHOT.

# Strict mode: abort on errors, unset variables, and mid-pipeline failures.
set -euo pipefail

# Resolve the script's own directory, then treat its parent as the project
# root so the script works regardless of the caller's cwd.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
|
|
| |
# --- Inputs ------------------------------------------------------------------
# $1: checkpoint directory (relative paths are resolved against PROJECT_DIR).
CHECKPOINT="${1:-checkpoints/korean_1b_sft/checkpoint-0005000}"
# Timestamp suffix keeps repeated runs from clobbering each other's outputs.
TIMESTAMP="$(date +%Y%m%d_%H%M%S)"
# $2: output directory for logs, raw lm-eval JSON, and SUMMARY.md.
OUTPUT_DIR="${2:-eval/outputs/full_${TIMESTAMP}}"

# Make both paths absolute. NOTE: a false `[[ … ]]` in an && list at top
# level does not trip `set -e` (bash ignores -e for non-final AND-OR parts).
[[ "$CHECKPOINT" != /* ]] && CHECKPOINT="$PROJECT_DIR/$CHECKPOINT"
[[ "$OUTPUT_DIR" != /* ]] && OUTPUT_DIR="$PROJECT_DIR/$OUTPUT_DIR"

# The converted HF-format model is named after the checkpoint's basename.
HF_MODEL_DIR="$PROJECT_DIR/outputs/hf_$(basename "$CHECKPOINT")"
# NOTE(review): assumes the tokenizer was exported as tokenizer.json at this
# fixed path — confirm it matches the training setup.
TOKENIZER="$PROJECT_DIR/tokenizer/korean_sp/tokenizer.json"
|
|
| |
| |
| |
# --- Runtime configuration (overridable via environment) ---------------------
# USE_MULTI_GPU=1 shards the model across all visible GPUs (lm-eval
# parallelize=True, i.e. device_map=auto); default is a single pinned GPU.
USE_MULTI_GPU="${USE_MULTI_GPU:-0}"
if [ "$USE_MULTI_GPU" = "1" ]; then
  # Suffix appended to lm-eval's --model_args in multi-GPU mode.
  MODEL_EXTRA_ARGS=",parallelize=True"
  echo "Multi-GPU mode enabled (device_map=auto)"
else
  MODEL_EXTRA_ARGS=""
  # Pin the run to one device (default GPU 0) unless the caller chose one.
  CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}"
fi

BATCH_SIZE="${BATCH_SIZE:-auto}"   # "auto" lets lm-eval probe a fitting batch size
NUM_FEWSHOT="${NUM_FEWSHOT:-0}"    # zero-shot by default

# Core Korean benchmarks.
TASKS_CORE="kobest,haerae,paws_ko"

# Extended knowledge benchmark.
TASKS_EXTENDED="global_mmlu_ko"

# Optional domain benchmark — not part of the default TASKS list; append it
# manually if wanted.
# shellcheck disable=SC2034
TASKS_OPTIONAL="kormedmcqa"

TASKS="${TASKS_CORE},${TASKS_EXTENDED}"
|
|
| |
# check_dep MODULE PIP_NAME
# Verifies that Python module $1 is importable; on failure prints an install
# hint (pip package name $2) to stderr and exits 1.
check_dep() {
  python3 -c "import $1" 2>/dev/null || {
    echo "ERROR: Python module '$1' not found. Install it with: pip install $2" >&2
    exit 1
  }
}
# Fail fast if any required Python package is missing (module:pip-name pairs,
# checked in order; the first missing one aborts the script).
for dep_spec in "lm_eval:lm-eval" "transformers:transformers" "safetensors:safetensors"; do
  check_dep "${dep_spec%%:*}" "${dep_spec#*:}"
done
|
|
# Print a run banner summarizing the effective configuration.
cat <<BANNER
==================================================
 Ko-LLM Full Benchmark Evaluation
==================================================
 Checkpoint : $CHECKPOINT
 HF output : $HF_MODEL_DIR
 Tasks : $TASKS
 Few-shot : $NUM_FEWSHOT
 Batch size : $BATCH_SIZE
 Output : $OUTPUT_DIR
 Multi-GPU : $USE_MULTI_GPU
 Start time : $(date)
==================================================
BANNER

# Everything below appends to a single log file inside the output directory.
mkdir -p "$OUTPUT_DIR"
LOG_FILE="$OUTPUT_DIR/eval_full.log"
|
|
| |
echo ""
echo "=> [1/3] Converting custom checkpoint to HF format..."

# Skip the conversion when a previous run already produced an HF model dir
# (presence of config.json is used as the completion marker).
if [ ! -f "$HF_MODEL_DIR/config.json" ]; then
  python3 "$PROJECT_DIR/scripts/convert_to_hf.py" \
    --checkpoint "$CHECKPOINT" \
    --output "$HF_MODEL_DIR" \
    --tokenizer "$TOKENIZER" \
    2>&1 | tee -a "$LOG_FILE"
  echo "HF conversion done: $HF_MODEL_DIR"
else
  echo "  - HF model already exists, skipping conversion: $HF_MODEL_DIR"
fi
|
|
| |
echo ""
echo "=> [2/3] Starting full lm-eval run..."
echo "   log: $LOG_FILE"
START_TIME=$(date +%s)

# Build the lm-eval command line once. The multi-GPU / single-GPU branches
# only differ in the model_args suffix (MODEL_EXTRA_ARGS, set above — empty
# or ",parallelize=True") and the CUDA_VISIBLE_DEVICES pin, so the previous
# duplicated command blocks collapse into a shared argument array.
LM_EVAL_ARGS=(
  --model hf
  --model_args "pretrained=$HF_MODEL_DIR,dtype=float16${MODEL_EXTRA_ARGS}"
  --tasks "$TASKS"
  --num_fewshot "$NUM_FEWSHOT"
  --batch_size "$BATCH_SIZE"
  --output_path "$OUTPUT_DIR"
  --log_samples
  --verbosity INFO
)

if [ "$USE_MULTI_GPU" = "1" ]; then
  # parallelize=True is already carried by MODEL_EXTRA_ARGS.
  python3 -m lm_eval "${LM_EVAL_ARGS[@]}" 2>&1 | tee -a "$LOG_FILE"
else
  # Pin to the single GPU selected above (default device 0).
  CUDA_VISIBLE_DEVICES="$CUDA_VISIBLE_DEVICES" python3 -m lm_eval "${LM_EVAL_ARGS[@]}" 2>&1 | tee -a "$LOG_FILE"
fi
|
|
# Report wall-clock duration of the evaluation step.
END_TIME=$(date +%s)
ELAPSED=$(( END_TIME - START_TIME ))
echo ""
echo "Evaluation finished! elapsed: $((ELAPSED/60))m $((ELAPSED%60))s"

echo ""
echo "=> [3/3] Generating result report..."
|
|
# Aggregate every lm-eval results JSON under OUTPUT_DIR into SUMMARY.md.
# argv: [1] output dir, [2] checkpoint path (label only).
python3 - "$OUTPUT_DIR" "$CHECKPOINT" <<'PYEOF'
import glob
import json
import os
import sys
from datetime import datetime

output_dir = sys.argv[1]
checkpoint = sys.argv[2] if len(sys.argv) > 2 else "unknown"

# lm-eval writes results_*.json plus per-sample dumps; skip the sample files.
results_files = sorted(glob.glob(f"{output_dir}/**/*.json", recursive=True))
results_files = [f for f in results_files if "samples_" not in os.path.basename(f)]

report_lines = [
    "# Ko-LLM Full Eval Report",
    f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    f"Checkpoint: {checkpoint}",
    "",
]

# Merge the "results" sections of all files. Best effort: unreadable files
# are reported on stderr but do not abort report generation.
all_results = {}
for rf in results_files:
    try:
        with open(rf) as f:
            data = json.load(f)
        results = data.get("results", {})
        if results:
            all_results.update(results)
    except Exception as exc:
        print(f"warning: could not parse {rf}: {exc}", file=sys.stderr)


def numeric_metrics(metrics):
    """Yield (name, value) for scalar metrics, skipping *_stderr entries."""
    for key, val in metrics.items():
        if "stderr" not in key and isinstance(val, (int, float)):
            yield key, val


def table_section(title, task_names):
    """Render one markdown metric table for the given benchmark tasks."""
    lines = [title, "| Task | Metric | Score |", "|------|--------|-------|"]
    for task in sorted(task_names):
        for key, val in numeric_metrics(all_results[task]):
            lines.append(f"| {task} | {key} | {val:.4f} |")
    return lines


kobest_tasks = [k for k in all_results if k.startswith("kobest_")]
if kobest_tasks:
    report_lines += table_section("## KoBEST", kobest_tasks)

haerae_tasks = [k for k in all_results if k.startswith("haerae")]
if haerae_tasks:
    report_lines += table_section("\n## HAE-RAE Bench", haerae_tasks)

# Keep only the top-level global_mmlu_ko aggregate (no "_<subtask>" suffix).
mmlu_top = {k: v for k, v in all_results.items()
            if k.startswith("global_mmlu_ko") and "_" not in k.replace("global_mmlu_ko", "")}
if mmlu_top:
    report_lines.append("\n## Global MMLU (Korean)")
    for task, metrics in mmlu_top.items():
        for key, val in numeric_metrics(metrics):
            report_lines.append(f"- {task} {key}: {val:.4f}")

# Anything not covered by the named sections above.
other_tasks = [k for k in all_results
               if not k.startswith("kobest_")
               and not k.startswith("haerae")
               and not k.startswith("global_mmlu_ko")]
if other_tasks:
    report_lines.append("\n## Other Tasks")
    for task in sorted(other_tasks):
        for key, val in numeric_metrics(all_results[task]):
            report_lines.append(f"- {task} | {key}: {val:.4f}")

report_path = os.path.join(output_dir, "SUMMARY.md")
with open(report_path, "w") as f:
    f.write("\n".join(report_lines))

# Echo the report to stdout as well so it lands in the terminal/log.
print("\n".join(report_lines))
print(f"\nReport saved: {report_path}")
PYEOF
|
|
# Final summary banner pointing at all produced artifacts.
echo ""
echo "=================================================="
echo "Full evaluation complete!"
echo " Results dir : $OUTPUT_DIR"
echo " Summary     : $OUTPUT_DIR/SUMMARY.md"
echo " Full log    : $LOG_FILE"
echo " End time    : $(date)"
echo "=================================================="
|
|