#!/usr/bin/env bash
#
# Ko-LLM quick evaluation.
#
# Converts a training checkpoint to Hugging Face format (skipped when a
# converted copy is already present), runs a zero-shot lm-eval pass over a
# fixed list of Korean tasks, and prints a summary of the resulting metrics.
#
# Usage:
#   <this script> [CHECKPOINT] [OUTPUT_DIR]
#
#   CHECKPOINT  Checkpoint path. Relative paths are resolved against the
#               project root (the parent of this script's directory).
#               Default: checkpoints/korean_1b_sft/checkpoint-0005000
#   OUTPUT_DIR  Where eval.log and the lm-eval results are written.
#               Default: eval/outputs/quick_<YYYYmmdd_HHMMSS>
#
# Environment:
#   CUDA_VISIBLE_DEVICES  Forwarded to the lm-eval process (default: 0).
#
# Requires python3 with lm_eval, transformers and safetensors importable.

# Abort on any error, on use of an unset variable, and when any stage of a
# pipeline fails (the evaluation output is piped through tee).
set -euo pipefail

# Project root is the parent of the directory that contains this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"

# Optional positional arguments: checkpoint path and output directory.
CHECKPOINT="${1:-checkpoints/korean_1b_sft/checkpoint-0005000}"
TIMESTAMP="$(date +%Y%m%d_%H%M%S)"
OUTPUT_DIR="${2:-eval/outputs/quick_${TIMESTAMP}}"

# Relative paths are anchored at the project root, not at the caller's
# working directory. Explicit `if` blocks avoid leaving a non-zero status
# behind the way a bare `[[ ... ]] && assignment` does.
if [[ "$CHECKPOINT" != /* ]]; then
  CHECKPOINT="$PROJECT_DIR/$CHECKPOINT"
fi
if [[ "$OUTPUT_DIR" != /* ]]; then
  OUTPUT_DIR="$PROJECT_DIR/$OUTPUT_DIR"
fi

# Derived settings. The converted model directory is named after the
# checkpoint's last path component.
HF_MODEL_DIR="$PROJECT_DIR/outputs/hf_$(basename "$CHECKPOINT")"
TOKENIZER="$PROJECT_DIR/tokenizer/korean_sp/tokenizer.json"
DEVICE="${CUDA_VISIBLE_DEVICES:-0}"
BATCH_SIZE="auto"

# Comma-separated lm-eval task list used for the quick evaluation.
TASKS="kobest_boolq,kobest_copa,haerae_general_knowledge,haerae_history,paws_ko"

#######################################
# Verify that a Python module can be imported; abort the script otherwise.
# Arguments:
#   $1 - module name as written in a Python `import` statement
#   $2 - package name to show in the `pip install` hint
# Outputs:
#   On failure, writes a one-line hint to stderr.
# Returns:
#   0 when the import succeeds; otherwise exits the shell with status 1.
#######################################
check_dep() {
  local module=$1
  local package=$2

  # Import errors are silenced on purpose: only the exit status matters,
  # and the hint below replaces the Python traceback.
  if ! python3 -c "import ${module}" 2>/dev/null; then
    echo "❌ ${module} not found. pip install ${package}" >&2
    exit 1
  fi
}
# Fail fast if any Python package used by the later steps is missing.
# First argument is the import name, second the pip package name.
check_dep lm_eval lm-eval
check_dep transformers transformers
check_dep safetensors safetensors

# Show the resolved run configuration before any work starts.
echo "=================================================="
echo " Ko-LLM Quick Eval"
echo "=================================================="
echo " Checkpoint : $CHECKPOINT"
echo " HF output : $HF_MODEL_DIR"
echo " Tasks : $TASKS"
echo " Output : $OUTPUT_DIR"
echo " Device : cuda:$DEVICE"
echo "=================================================="

# The evaluation step tees its log into this directory, so it must exist
# before lm-eval is started.
mkdir -p "$OUTPUT_DIR"

# Step 1: convert the checkpoint to Hugging Face format. The presence of
# config.json in the target directory is treated as "already converted",
# in which case the conversion is skipped.
if [[ ! -f "$HF_MODEL_DIR/config.json" ]]; then
  echo ""
  echo "▶ Step 1: 커스텀 체크포인트 → HF 포맷 변환..."
  python3 "$PROJECT_DIR/scripts/convert_to_hf.py" \
    --checkpoint "$CHECKPOINT" \
    --output "$HF_MODEL_DIR" \
    --tokenizer "$TOKENIZER"
  echo "✅ HF 변환 완료: $HF_MODEL_DIR"
else
  echo "▶ Step 1: HF 모델 이미 존재, 변환 스킵"
  echo " $HF_MODEL_DIR"
fi

# Step 2: run lm-eval on the converted model and time the run.
echo ""
echo "▶ Step 2: lm-eval 평가 시작..."
START_TIME=$(date +%s)

# stderr is merged into stdout so that eval.log captures everything that is
# shown on the terminal. Under `set -o pipefail` a failing lm_eval still
# fails the pipeline even though tee is the last stage.
CUDA_VISIBLE_DEVICES="$DEVICE" python3 -m lm_eval \
  --model hf \
  --model_args "pretrained=$HF_MODEL_DIR,dtype=float16" \
  --tasks "$TASKS" \
  --num_fewshot 0 \
  --batch_size "$BATCH_SIZE" \
  --output_path "$OUTPUT_DIR" \
  --log_samples \
  --verbosity INFO \
  2>&1 | tee "$OUTPUT_DIR/eval.log"

# Wall-clock duration of the evaluation, in whole seconds.
END_TIME=$(date +%s)
ELAPSED=$(( END_TIME - START_TIME ))

# Completion banner: elapsed time as minutes/seconds and the output location.
echo ""
echo "=================================================="
echo "✅ 평가 완료!"
echo " 소요시간: $((ELAPSED / 60))분 $((ELAPSED % 60))초"
echo " 결과 저장: $OUTPUT_DIR"
echo "=================================================="

#######################################
# Print the numeric metrics found in lm-eval result JSON files.
#
# Looks recursively under the given directory for *.json files whose file
# name contains "results"; if there are none, falls back to every *.json
# directly inside the directory. At most the first three files (in sorted
# order) are reported. Metrics whose key contains "stderr" and non-numeric
# values are omitted.
#
# Arguments:
#   $1 - directory that lm-eval wrote its output to
# Outputs:
#   Writes the summary (or a "no results" notice) to stdout.
# Returns:
#   Exit status of the embedded Python program (0 unless Python itself fails).
#######################################
summarize_results() {
  local results_dir=$1

  python3 - "$results_dir" <<'PYEOF'
import glob
import json
import os
import sys

NO_RESULTS_MSG = "결과 JSON 파일 없음. eval.log 확인하세요."

output_dir = sys.argv[1] if len(sys.argv) > 1 else "."

# An empty or missing directory must never turn into a scan of some other
# location (an empty string would make the recursive glob start at "/").
if not os.path.isdir(output_dir):
    print(NO_RESULTS_MSG)
    sys.exit(0)

# Sorted so that the files picked by the [:3] cut-off are deterministic.
candidates = sorted(
    glob.glob(os.path.join(output_dir, "**", "*.json"), recursive=True)
)
# Match on the file name only, so a directory called e.g. "results/" does
# not pull in unrelated JSON files.
results_files = [p for p in candidates if "results" in os.path.basename(p)]

if not results_files:
    # Fallback: any JSON file sitting directly in the output directory.
    results_files = sorted(glob.glob(os.path.join(output_dir, "*.json")))

if not results_files:
    print(NO_RESULTS_MSG)
    sys.exit(0)

for rf in results_files[:3]:
    try:
        with open(rf) as f:
            data = json.load(f)
        results = data.get("results", {})
        print(f"\n{'='*50}\nTask Results: {os.path.basename(rf)}\n{'='*50}")
        for task, metrics in results.items():
            print(f"\n{task}:")
            for key, val in metrics.items():
                if "stderr" not in key and isinstance(val, (int, float)):
                    print(f" {key}: {val:.4f}")
    except Exception as e:
        # Best effort: one unreadable file must not hide the others.
        print(f"파싱 실패: {rf}: {e}")
PYEOF
}

# Step 3: summarize whatever lm-eval wrote into the output directory.
echo ""
echo "▶ Step 3: 결과 요약"
summarize_results "$OUTPUT_DIR"