| |
| """ |
| COPA Dataset Evaluation: Raw vs Fine-tuned Model |
| |
| Evaluates models on the COPA (Choice of Plausible Alternatives) dataset. |
| Focuses on tasks where the effect is given and we need to identify the cause. |
| |
| Usage: |
| python evaluate_copa_raw_vs_finetuned.py [--max_samples N] [--batch_size N] [--checkpoint_dir PATH] |
| """ |
|
|
| import os |
| import json |
| import argparse |
| import re |
| from datetime import datetime |
| from tqdm import tqdm |
| import torch |
| from datasets import load_dataset |
| from transformers import AutoTokenizer, AutoModelForCausalLM |
| from peft import PeftModel |
| import numpy as np |
| import time |
| from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score |
| import warnings |
| warnings.filterwarnings('ignore') |
|
|
| |
| |
| |
|
|
| |
# Path to the base (untrained) 4-bit model; override with EVAL_RAW_MODEL_PATH.
RAW_MODEL_PATH = os.environ.get('EVAL_RAW_MODEL_PATH',
    "/home/moein_salimi/PLLMS/unsloth-Qwen2.5-3B-Instruct-unsloth-bnb-4bit")
# GRPO training run directory (holds val_metrics.json and checkpoint/);
# override with EVAL_TRAINING_DIR.
TRAINING_DIR = os.environ.get('EVAL_TRAINING_DIR',
    "/home/moein_salimi/users/amirmo/AbductiveReasoning/GRPO/results/dt11.10.16:42_e20_unsloth_Qwen2.5_3B_Instruct_unsloth_bnb_4bit_bnb_4bit_lr1e-05_t0.7_ε0.2_r64_b16")
# Subdirectory of TRAINING_DIR that contains the checkpoint-* folders.
CHECKPOINT_DIR = os.path.join(TRAINING_DIR, "checkpoint")
# Destination for all evaluation JSON artifacts; override with EVAL_OUTPUT_DIR.
OUTPUT_DIR = os.environ.get('EVAL_OUTPUT_DIR',
    "/home/moein_salimi/users/amirmo/AbductiveReasoning/GRPO/Evaluation/copa_evaluation_results")
|
|
| |
| |
| |
|
|
def _list_checkpoints(checkpoint_dir):
    """Return the names of all 'checkpoint-*' subdirectories of checkpoint_dir."""
    return [d for d in os.listdir(checkpoint_dir)
            if d.startswith('checkpoint-') and os.path.isdir(os.path.join(checkpoint_dir, d))]


def _latest_checkpoint(checkpoint_dir):
    """Return (path, 0.0) for the highest-step checkpoint, or (None, 0.0) if none exist."""
    checkpoints = _list_checkpoints(checkpoint_dir)
    if checkpoints:
        latest = max(checkpoints, key=lambda x: int(x.split('-')[1]))
        return os.path.join(checkpoint_dir, latest), 0.0
    return None, 0.0


def find_best_checkpoint(training_dir, total_epochs=20.0):
    """Find the best checkpoint based on validation metrics.

    Args:
        training_dir: run directory containing val_metrics.json and checkpoint/.
        total_epochs: number of training epochs, used to map the best
            validation epoch onto an approximate checkpoint step.
            Defaults to 20 (was previously a hard-coded magic constant
            matching the 'e20' run naming).

    Returns:
        (checkpoint_path, best_score) tuple; (None, 0.0) when nothing usable
        is found.  Falls back to the latest checkpoint when val_metrics.json
        is missing or contains no usable entries.
    """
    print("\n📁 Finding best checkpoint...")

    val_metrics_path = os.path.join(training_dir, "val_metrics.json")
    checkpoint_dir = os.path.join(training_dir, "checkpoint")

    if not os.path.exists(val_metrics_path):
        print(f"⚠️ No val_metrics.json found, using latest checkpoint")
        return _latest_checkpoint(checkpoint_dir)

    with open(val_metrics_path, 'r') as f:
        val_metrics = json.load(f)

    # Pick the epoch with the highest average validation reward.
    best_epoch = None
    best_score = 0.0
    for epoch_str, metrics in val_metrics.items():
        if metrics['avg_reward'] > best_score:
            best_score = metrics['avg_reward']
            best_epoch = float(epoch_str)

    if best_epoch is None:
        print("⚠️ No valid metrics found, using latest checkpoint")
        return _latest_checkpoint(checkpoint_dir)

    checkpoints = _list_checkpoints(checkpoint_dir)
    if not checkpoints:
        return None, 0.0

    checkpoint_steps = sorted((int(cp.split('-')[1]), cp) for cp in checkpoints)

    # Estimate the step count per epoch from the final checkpoint, then pick
    # the checkpoint whose step is closest to the best epoch's estimated step.
    max_checkpoint_step = checkpoint_steps[-1][0]
    estimated_steps_per_epoch = max_checkpoint_step / total_epochs
    target_step = int(best_epoch * estimated_steps_per_epoch)

    best_checkpoint = min(checkpoint_steps, key=lambda x: abs(x[0] - target_step))
    checkpoint_path = os.path.join(checkpoint_dir, best_checkpoint[1])

    print(f"✅ Best checkpoint: {best_checkpoint[1]}")
    print(f" Validation score: {best_score:.4f} at epoch {best_epoch:.2f}")

    return checkpoint_path, best_score
|
|
def load_raw_model(device):
    """Load the raw/base model and its tokenizer in 4-bit on the given GPU.

    Args:
        device: CUDA device index (string or int), e.g. '0'.

    Returns:
        (model, tokenizer), with the model in eval mode and the tokenizer
        guaranteed to have a pad token.
    """
    print(f"\n🤖 Loading raw model from: {RAW_MODEL_PATH}")

    tokenizer = AutoTokenizer.from_pretrained(RAW_MODEL_PATH, trust_remote_code=True)

    model = AutoModelForCausalLM.from_pretrained(
        RAW_MODEL_PATH,
        torch_dtype=torch.float16,
        # Honor the requested device; previously this was the placeholder-free
        # f-string f"cuda:0", which silently ignored the `device` argument.
        device_map={"": f"cuda:{device}"},
        trust_remote_code=True,
        load_in_4bit=True,
    )

    # Many causal-LM tokenizers ship without a pad token; fall back to EOS so
    # padded batch generation works.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model.eval()
    print("✅ Raw model loaded successfully")

    return model, tokenizer
|
|
def load_finetuned_model(checkpoint_path, device):
    """Load the base model in 4-bit and attach the LoRA adapter from a checkpoint.

    Args:
        checkpoint_path: directory containing the PEFT/LoRA adapter weights.
        device: CUDA device index (string or int), e.g. '0'.

    Returns:
        (model, tokenizer), with the PEFT-wrapped model in eval mode and the
        tokenizer guaranteed to have a pad token.
    """
    print(f"\n🎯 Loading fine-tuned model from: {checkpoint_path}")

    base_tokenizer = AutoTokenizer.from_pretrained(RAW_MODEL_PATH, trust_remote_code=True)

    base_model = AutoModelForCausalLM.from_pretrained(
        RAW_MODEL_PATH,
        torch_dtype=torch.float16,
        # Honor the requested device; previously this was the placeholder-free
        # f-string f"cuda:0", which silently ignored the `device` argument.
        device_map={"": f"cuda:{device}"},
        trust_remote_code=True,
        load_in_4bit=True,
    )

    # Attach the trained LoRA adapter on top of the frozen 4-bit base.
    model = PeftModel.from_pretrained(base_model, checkpoint_path)

    if base_tokenizer.pad_token is None:
        base_tokenizer.pad_token = base_tokenizer.eos_token

    model.eval()
    print("✅ Fine-tuned model loaded successfully")

    return model, base_tokenizer
|
|
def create_copa_prompt(premise, choice1, choice2):
    """Create a prompt for the COPA causal reasoning task ('cause' questions).

    The evaluation filters the dataset to question == 'cause', so the premise
    is an observed EFFECT and the two choices are candidate CAUSES.  The
    previous prompt asked the model for the most plausible *effect* of a
    cause, inverting the task relative to the labels; it now asks for the
    most plausible cause of the given effect.

    Args:
        premise: The effect that occurred.
        choice1: First possible cause.
        choice2: Second possible cause.

    Returns:
        (system_prompt, user_prompt) strings.
    """
    system_prompt = """You are an expert in causal reasoning. Given an effect and two possible cause options, select which option (1 or 2) is the most plausible direct cause.

First, think step by step and explain your causal reasoning in just one paragraph. Then decide which option (1 or 2) is better.

Your entire output MUST use exactly the following format and nothing else (no text before, between, or after these tags):

<reasoning>
[here you write your chain-of-thought reasoning about which cause is more plausible]
</reasoning>
<answer>
[here you output ONLY the number 1 or 2]
</answer>"""

    user_prompt = f"""Effect: {premise}

Which of the following is the most plausible CAUSE of this effect?

Option 1: {choice1}
Option 2: {choice2}

Think step by step about which option is the most likely cause, then provide your answer in <answer></answer> tags."""

    return system_prompt, user_prompt
|
|
def extract_reasoning(response):
    """Return the text inside the first <reasoning>...</reasoning> pair, or None."""
    pattern = re.compile(r'<reasoning>(.*?)</reasoning>', re.IGNORECASE | re.DOTALL)
    found = pattern.search(response)
    return found.group(1).strip() if found else None
|
|
def extract_answer(response):
    """Extract the model's choice from its response text.

    Tries, in order: a well-formed <answer>1|2</answer> tag (first match),
    the last numeric <answer>N</answer> tag, then a few loose textual
    fallback patterns.

    Returns:
        int: 0 or 1 representing the choice (0-indexed), or None if
        extraction fails.
    """
    # Strategy 1: strict tag containing exactly 1 or 2 (first occurrence).
    strict = re.search(r'<answer>\s*([12])\s*</answer>', response, re.IGNORECASE)
    if strict is not None:
        return int(strict.group(1)) - 1

    # Strategy 2: any numeric answer tag; consider the last one found.
    numeric_tags = re.findall(r'<answer>\s*(\d+)\s*</answer>', response, re.IGNORECASE)
    if numeric_tags:
        candidate = numeric_tags[-1].strip()
        if candidate in ('1', '2'):
            return int(candidate) - 1

    # Strategy 3: loose textual fallbacks, tried in decreasing specificity;
    # each uses the last match in the response.
    loose_patterns = (
        r'(?:answer|choice)[\s:]+(\d+)',
        r'option\s+(\d+)',
        r'(?:^|\s)([12])(?:\s|$|\.|,)',
    )
    for loose in loose_patterns:
        hits = re.findall(loose, response, re.IGNORECASE)
        if hits:
            candidate = hits[-1].strip()
            if candidate in ('1', '2'):
                return int(candidate) - 1

    return None
|
|
def evaluate_on_copa(model, tokenizer, max_samples=None, model_name="Model", batch_size=1, split="validation"):
    """Evaluate model on COPA dataset (cause questions only).

    Args:
        model: causal LM (raw or PEFT-wrapped) already in eval mode.
        tokenizer: matching tokenizer; must have a pad token for batching.
        max_samples: optional cap on the number of evaluated samples.
        model_name: label used in the progress bar and printed report.
        batch_size: number of prompts per generate() call.
        split: requested dataset split.  NOTE(review): currently unused —
            the load_dataset call below always loads the 'train' split of
            pkavumba/balanced-copa; confirm whether that is intentional.

    Returns:
        dict with accuracy/f1/precision/recall, extraction stats, total
        wall-clock time, and per-sample 'results'; None if the dataset
        cannot be loaded.
    """
    print(f"\n🔍 Evaluating {model_name} on COPA dataset (split: {split})...")
    print(f" Task: Identify the CAUSE given an EFFECT")
    print(f" Batch size: {batch_size}")

    # Load the dataset (needs hub access on first run; cached afterwards).
    print("Loading COPA dataset...")

    try:
        dataset = load_dataset("pkavumba/balanced-copa", split="train")
        print(f"Loaded {len(dataset)} samples from COPA dataset")

    except Exception as e:
        print(f"❌ Error loading dataset: {e}")
        print("\n💡 Make sure you have internet connection and the dataset is accessible.")
        return None

    # Keep only 'cause' questions: the premise is an effect and the two
    # choices are candidate causes.
    cause_dataset = dataset.filter(lambda x: x['question'] == 'cause')
    print(f"Filtered to {len(cause_dataset)} 'cause' questions (given effect, find cause)")

    if len(cause_dataset) == 0:
        print("❌ No 'cause' questions found in dataset!")
        return None

    if max_samples:
        cause_dataset = cause_dataset.select(range(min(max_samples, len(cause_dataset))))
        print(f"Evaluating on {len(cause_dataset)} samples (limited)")
    else:
        print(f"Evaluating on {len(cause_dataset)} samples (full filtered {split} set)")

    results = []
    all_true_labels = []
    all_pred_labels = []
    failed_extractions = 0  # responses from which no 1/2 answer could be parsed

    # Ceiling division so a final partial batch is still processed.
    num_batches = (len(cause_dataset) + batch_size - 1) // batch_size
    btime = time.time()

    for batch_idx in tqdm(range(num_batches), desc=f"Evaluating {model_name}"):

        start_idx = batch_idx * batch_size
        end_idx = min(start_idx + batch_size, len(cause_dataset))
        batch = cause_dataset[start_idx:end_idx]

        # HF datasets slicing yields dict-of-lists; normalize the
        # single-sample case so every field value is a list.
        if not isinstance(batch['premise'], list):
            batch = {k: [v] for k, v in batch.items()}

        batch_size_actual = len(batch['premise'])

        formatted_prompts = []
        true_labels_batch = []
        batch_data = []

        for i in range(batch_size_actual):
            premise = batch['premise'][i]
            choice1 = batch['choice1'][i]
            choice2 = batch['choice2'][i]
            true_label = batch['label'][i]

            system_prompt, user_prompt = create_copa_prompt(premise, choice1, choice2)

            # Prefer the tokenizer's chat template; fall back to plain
            # concatenation for tokenizers without one.
            try:
                messages = [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ]
                formatted_prompt = tokenizer.apply_chat_template(
                    messages,
                    tokenize=False,
                    add_generation_prompt=True
                )
            except:
                formatted_prompt = f"{system_prompt}\n\n{user_prompt}"

            formatted_prompts.append(formatted_prompt)
            true_labels_batch.append(true_label)
            batch_data.append({
                'premise': premise,
                'choice1': choice1,
                'choice2': choice2,
                'true_label': true_label,
                'id': start_idx + i
            })

        # Tokenize the whole batch with padding/truncation, then move the
        # tensors to the model's device.
        inputs = tokenizer(
            formatted_prompts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512
        )
        inputs = {k: v.to(model.device) for k, v in inputs.items()}

        # Greedy decoding.  NOTE(review): temperature=0.0 is ignored when
        # do_sample=False and may trigger a transformers warning.
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=2048,
                temperature=0.0,
                do_sample=False,

                pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id else tokenizer.eos_token_id
            )

        for i in range(len(formatted_prompts)):
            # Strip the prompt tokens and decode only the generated tail.
            input_length = inputs['input_ids'][i].shape[0]
            response = tokenizer.decode(outputs[i][input_length:], skip_special_tokens=True)

            predicted_label = extract_answer(response)

            # The full response (not just the <reasoning> span) is stored so
            # disagreement cases can be inspected later.
            reasoning = response

            if predicted_label is None:
                failed_extractions += 1
                predicted_label = 0  # default to option 1 when unparseable

            true_label = true_labels_batch[i]

            all_true_labels.append(true_label)
            all_pred_labels.append(predicted_label)

            is_correct = (predicted_label == true_label)
            results.append({
                'sample_id': batch_data[i]['id'],
                'premise': batch_data[i]['premise'],
                'choice1': batch_data[i]['choice1'],
                'choice2': batch_data[i]['choice2'],
                'true_label': true_label,
                'predicted_label': predicted_label,
                'reasoning': reasoning,
                'correct': is_correct
            })

    # NOTE(review): btime is set once before the loop, so this measures the
    # TOTAL evaluation time, not the time of a single batch.
    etime = time.time()
    print(f"Batch processing time: {etime - btime:.2f} seconds")

    all_true_labels = np.array(all_true_labels)
    all_pred_labels = np.array(all_pred_labels)

    accuracy = accuracy_score(all_true_labels, all_pred_labels)

    # Binary-averaged scores: label 1 (choice 2) is the positive class.
    f1 = f1_score(all_true_labels, all_pred_labels, average='binary', zero_division=0)
    precision = precision_score(all_true_labels, all_pred_labels, average='binary', zero_division=0)
    recall = recall_score(all_true_labels, all_pred_labels, average='binary', zero_division=0)

    correct_count = sum(1 for r in results if r['correct'])

    # Fraction of responses from which an answer could actually be parsed.
    extraction_rate = (len(results) - failed_extractions) / len(results) if results else 0.0

    print(f"\n📊 {model_name} Results:")
    print(f" Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%) - {correct_count}/{len(results)} correct")
    print(f" F1 Score: {f1:.4f}")
    print(f" Precision: {precision:.4f}")
    print(f" Recall: {recall:.4f}")
    print(f" Extraction Rate: {extraction_rate:.4f} ({extraction_rate*100:.2f}%)")
    print(f" Failed extractions: {failed_extractions}/{len(results)} ({failed_extractions/len(results)*100:.1f}%)")

    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'extraction_rate': extraction_rate,
        'correct_count': correct_count,
        'total': len(results),
        'failed_extractions': failed_extractions,
        'time': etime - btime,
        'results': results
    }
|
|
|
|
|
|
def evaluate_model_with_dynamic_batch(model, tokenizer, args, model_name):
    """Evaluate a model with automatic batch-size backoff to avoid CUDA OOM.

    Starts at args.batch_size and halves on out-of-memory errors until the
    evaluation succeeds or the batch size drops below 1.  A None result from
    evaluate_on_copa that is NOT caused by OOM (e.g. the dataset failed to
    load) aborts immediately instead of pointlessly retrying at smaller
    batch sizes, as the previous version did.

    Args:
        model: model to evaluate.
        tokenizer: its tokenizer.
        args: parsed CLI args (uses batch_size, max_samples, split).
        model_name: label for log messages.

    Returns:
        The results dict from evaluate_on_copa, or None on failure.
    """
    results = None
    batch_size = args.batch_size

    while batch_size >= 1 and results is None:
        oom = False
        try:
            print(f"\n🧪 Evaluating {model_name} with batch_size={batch_size}")
            results = evaluate_on_copa(
                model,
                tokenizer,
                args.max_samples,
                model_name,
                batch_size,
                args.split
            )
            if results is not None:
                print(f"✅ {model_name} evaluation succeeded with batch_size={batch_size}")
        except torch.cuda.OutOfMemoryError:
            print(f"⚠️ CUDA OutOfMemoryError at batch_size={batch_size}, halving batch size...")
            oom = True
        except RuntimeError as e:
            # Some OOMs surface as generic RuntimeErrors; match on the message.
            if "out of memory" in str(e).lower():
                print(f"⚠️ RuntimeError OOM at batch_size={batch_size}, halving batch size...")
                oom = True
            else:
                raise

        if results is None:
            if not oom:
                # Non-memory failure (e.g. dataset load error): shrinking the
                # batch cannot help, so stop retrying.
                print(f"❌ {model_name}: evaluation failed for a non-OOM reason, giving up.")
                break
            torch.cuda.empty_cache()
            batch_size = batch_size // 2

    if results is None and batch_size < 1:
        print(f"❌ {model_name}: still out of memory even at batch_size=1, giving up.")

    return results
|
|
def ensure_raw_results_cached(args):
    """
    Ensure raw COPA results are cached on disk for the current configuration.

    The cache file name now encodes the split and sample cap so runs with
    different configurations do not read each other's results.  Previously
    the name was hard-coded to "raw_results_train_all.json" (the split and
    sample_tag variables were computed but never used), so every
    configuration collided on one file.

    Args:
        args: parsed CLI args (uses split, max_samples, cuda_device,
            batch_size via the dynamic-batch helper).

    Returns:
        The loaded or newly computed raw_results dict, or None on failure.
    """
    dataset_name = "copa"
    split = args.split
    sample_tag = f"max{args.max_samples}" if args.max_samples else "all"

    raw_results_dir = os.path.join(OUTPUT_DIR, "raw_model", dataset_name)
    os.makedirs(raw_results_dir, exist_ok=True)

    # Key the cache on the evaluated split and sample count.
    raw_results_file = os.path.join(
        raw_results_dir,
        f"raw_results_{split}_{sample_tag}.json"
    )

    if os.path.exists(raw_results_file):
        print(f"\n📂 Found cached raw model results: {raw_results_file}")
        with open(raw_results_file, "r") as f:
            raw_results = json.load(f)
        return raw_results

    print("\n🔁 No cached raw model results found for this configuration.")
    print(" Running raw model once and caching per-sample results...")

    raw_model, raw_tokenizer = load_raw_model(args.cuda_device)
    raw_results = evaluate_model_with_dynamic_batch(
        raw_model, raw_tokenizer, args, "Raw Model (cached)"
    )
    # Release GPU memory before any file I/O.
    del raw_model
    torch.cuda.empty_cache()

    if raw_results is None:
        print("❌ Failed to compute raw model results; cannot cache.")
        return None

    raw_results_with_meta = {
        "model_path": RAW_MODEL_PATH,
        "dataset": dataset_name,
        "split": split,
        "max_samples": args.max_samples,
        **raw_results
    }

    with open(raw_results_file, "w") as f:
        json.dump(raw_results_with_meta, f, indent=2)
    print(f"💾 Cached raw model results saved to: {raw_results_file}")

    return raw_results_with_meta
|
|
def ensure_finetuned_results_cached(args, ckpt_name):
    """
    Check whether fine-tuned model results are already cached on disk for
    the current configuration.

    Args:
        args: parsed CLI args (uses args.run as the run subdirectory name).
        ckpt_name: checkpoint directory basename, e.g. "checkpoint-120".

    Returns:
        bool: True if both disagreement_cases.json and all_cases.json exist
        under OUTPUT_DIR/<run>/<ckpt_name>/copa, else False.  (The previous
        docstring wrongly claimed a results dict was returned.)
    """
    dataset_name = "copa"
    # Previously built via "/".join(OUTPUT_DIR.split("/")[:]), which is just
    # OUTPUT_DIR round-tripped; use it directly.
    ckpt_output_dir = os.path.join(OUTPUT_DIR, args.run, ckpt_name, dataset_name)
    required_files = ("disagreement_cases.json", "all_cases.json")
    if all(os.path.exists(os.path.join(ckpt_output_dir, name)) for name in required_files):
        print(f"\n📂 Found cached fine-tuned model results: {ckpt_output_dir}")
        return True

    print("\n🔁 No cached fine-tuned model results found for this configuration.")
    return False
|
|
|
|
def evaluate_checkpoint_cases(args, checkpoint_path):
    """
    Given a single checkpoint, evaluate it vs cached raw results and save:
    - all_cases.json
    - disagreement_cases.json
    under: OUTPUT_DIR/<run>/<checkpoint_name>/copa/

    Args:
        args: parsed CLI args (uses run, cuda_device, max_samples, split,
            batch_size through the helpers called here).
        checkpoint_path: path to one checkpoint-* directory.

    Returns:
        dict with raw/fine-tuned results and output file paths, or None
        (implicit) on any early-exit failure or when results are cached.
    """
    print(f"\n📁 Checkpoint path argument received: {checkpoint_path}")
    if not os.path.isabs(checkpoint_path):
        checkpoint_path = os.path.abspath(checkpoint_path)
        print(f" Converted to absolute path: {checkpoint_path}")

    if not os.path.exists(checkpoint_path):
        print(f"❌ Error: Checkpoint path does not exist: {checkpoint_path}")
        print(f" Please check the path and try again.")
        return

    # Trailing-slash-safe basename, e.g. "checkpoint-120".
    ckpt_name = os.path.basename(checkpoint_path.rstrip("/"))
    print(f"✅ Using checkpoint for per-case evaluation: {ckpt_name}")

    # Raw-model results are computed once and shared across checkpoints.
    raw_results = ensure_raw_results_cached(args)
    if raw_results is None:
        print("❌ Cannot evaluate checkpoint without raw model results.")
        return

    # Skip all work when this checkpoint's output files already exist.
    if ensure_finetuned_results_cached(args, ckpt_name):
        print(f"✅ Using cached fine-tuned model results for per-case evaluation: {ckpt_name}")
        return

    finetuned_model, finetuned_tokenizer = load_finetuned_model(checkpoint_path, args.cuda_device)
    finetuned_results = evaluate_model_with_dynamic_batch(
        finetuned_model,
        finetuned_tokenizer,
        args,
        f"Fine-tuned Model ({ckpt_name})"
    )
    # Release GPU memory before writing results.
    del finetuned_model
    torch.cuda.empty_cache()

    if finetuned_results is None:
        print("❌ Fine-tuned model evaluation failed; aborting.")
        return

    dataset_name = "copa"
    # NOTE(review): "/".join(OUTPUT_DIR.split("/")[:]) is equivalent to
    # OUTPUT_DIR itself — presumably a leftover from an earlier path scheme.
    ckpt_output_dir = os.path.join("/".join(OUTPUT_DIR.split("/")[:]), args.run, ckpt_name, dataset_name)
    os.makedirs(ckpt_output_dir, exist_ok=True)

    # Pair raw and fine-tuned per-sample entries by 1-based position;
    # assumes both evaluations iterated the dataset in the same order.
    raw_by_id = {idx + 1: r for idx, r in enumerate(raw_results["results"])}
    ft_by_id = {idx + 1: r for idx, r in enumerate(finetuned_results["results"])}

    disagreement_cases = []

    for pid, raw_r in raw_by_id.items():
        if pid not in ft_by_id:
            continue
        ft_r = ft_by_id[pid]

        case_entry = {
            "problem_id": pid,
            "premise": raw_r["premise"],
            "choice1": raw_r["choice1"],
            "choice2": raw_r["choice2"],
            "true_label": raw_r["true_label"],
            "raw": {
                "predicted_label": raw_r["predicted_label"],
                "reasoning": raw_r["reasoning"],
                "correct": raw_r["correct"]
            },
            "finetuned": {
                "predicted_label": ft_r["predicted_label"],
                "reasoning": ft_r["reasoning"],
                "correct": ft_r["correct"]
            }
        }

        # Only keep cases where exactly one of the two models is correct.
        if raw_r["correct"] == ft_r["correct"]:
            continue

        if raw_r["correct"] and not ft_r["correct"]:
            disagreement_type = "raw_correct_finetuned_wrong"
        else:
            disagreement_type = "finetuned_correct_raw_wrong"

        disagreement_cases.append({
            **case_entry,
            "disagreement_type": disagreement_type
        })

    disagreement_file = os.path.join(ckpt_output_dir, "disagreement_cases.json")
    with open(disagreement_file, "w") as f:
        json.dump(disagreement_cases, f, indent=2)
    print(f"💾 Disagreement cases saved to: {disagreement_file}")

    finetune_results_with_meta = {
        "dataset": dataset_name,
        "max_samples": args.max_samples,
        **finetuned_results
    }

    # "all_cases.json" holds the full fine-tuned results (with metadata),
    # not the merged raw/fine-tuned case entries.
    finetune_results_file = os.path.join(ckpt_output_dir, "all_cases.json")
    with open(finetune_results_file, "w") as f:
        json.dump(finetune_results_with_meta, f, indent=2)
    print(f"💾 finetune model results saved to: {finetune_results_file}")

    return {
        "raw_results": raw_results,
        "finetuned_results": finetuned_results,
        "all_cases_file": finetune_results_file,
        "disagreement_file": disagreement_file
    }
|
|
|
|
def save_results(raw_results, finetuned_results, best_checkpoint_info, output_dir):
    """Save evaluation results to JSON files.

    Writes five timestamped artifacts under output_dir:
      - raw_model_copa_results_<ts>.json
      - finetuned_model_copa_results_<ts>.json
      - copa_comparison_summary_<ts>.json
      - disagreement_cases_<ts>.json
      - all_cases_<ts>.json

    Args:
        raw_results: dict from evaluate_on_copa for the base model.
        finetuned_results: dict from evaluate_on_copa for the checkpoint.
        best_checkpoint_info: {'path': ..., 'score': ...} for the checkpoint.
        output_dir: destination directory (created if missing).

    Returns:
        The comparison summary dict.
    """
    os.makedirs(output_dir, exist_ok=True)

    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    def _metrics(res):
        # Shared metric sub-dict; previously duplicated four times inline.
        return {
            'accuracy': res['accuracy'],
            'f1': res['f1'],
            'precision': res['precision'],
            'recall': res['recall'],
            'extraction_rate': res['extraction_rate']
        }

    raw_output = {
        'model': RAW_MODEL_PATH,
        'evaluation_time': timestamp,
        'dataset': 'COPA (cause questions only)',
        'metrics': _metrics(raw_results),
        'correct_count': raw_results['correct_count'],
        'total': raw_results['total'],
        'failed_extractions': raw_results['failed_extractions'],
        # Cap per-sample detail to keep the file a reasonable size.
        'detailed_results': raw_results['results'][:100]
    }

    raw_file = os.path.join(output_dir, f"raw_model_copa_results_{timestamp}.json")
    with open(raw_file, 'w') as f:
        json.dump(raw_output, f, indent=2)
    print(f"\n💾 Raw model results saved to: {raw_file}")

    finetuned_output = {
        'base_model': RAW_MODEL_PATH,
        'checkpoint': best_checkpoint_info['path'],
        'validation_score': best_checkpoint_info['score'],
        'evaluation_time': timestamp,
        'dataset': 'COPA (cause questions only)',
        'metrics': _metrics(finetuned_results),
        'correct_count': finetuned_results['correct_count'],
        'total': finetuned_results['total'],
        'failed_extractions': finetuned_results['failed_extractions'],
        'detailed_results': finetuned_results['results'][:100]
    }

    finetuned_file = os.path.join(output_dir, f"finetuned_model_copa_results_{timestamp}.json")
    with open(finetuned_file, 'w') as f:
        json.dump(finetuned_output, f, indent=2)
    print(f"💾 Fine-tuned model results saved to: {finetuned_file}")

    improvement_acc = finetuned_results['accuracy'] - raw_results['accuracy']
    improvement_f1 = finetuned_results['f1'] - raw_results['f1']

    summary = {
        'evaluation_time': timestamp,
        'dataset': 'COPA (cause questions only)',
        # NOTE(review): split is hard-coded; the caller's actual split is
        # not passed into this function.
        'split': 'validation',
        'num_samples': raw_results['total'],
        'raw_model': {
            'path': RAW_MODEL_PATH,
            'metrics': _metrics(raw_results)
        },
        'finetuned_model': {
            'base_model': RAW_MODEL_PATH,
            'checkpoint': best_checkpoint_info['path'],
            'validation_score': best_checkpoint_info['score'],
            'metrics': _metrics(finetuned_results)
        },
        'comparison': {
            'accuracy_improvement': improvement_acc,
            'f1_improvement': improvement_f1,
            'overall_improved': improvement_acc > 0
        }
    }

    summary_file = os.path.join(output_dir, f"copa_comparison_summary_{timestamp}.json")
    with open(summary_file, 'w') as f:
        json.dump(summary, f, indent=2)
    print(f"💾 Comparison summary saved to: {summary_file}")

    # Pair raw and fine-tuned per-sample entries by 1-based position;
    # assumes both evaluations iterated the dataset in the same order.
    raw_by_id = {idx + 1: r for idx, r in enumerate(raw_results['results'])}
    ft_by_id = {idx + 1: r for idx, r in enumerate(finetuned_results['results'])}

    disagreement_cases, all_cases = [], []

    for pid, raw_r in raw_by_id.items():
        if pid not in ft_by_id:
            continue
        ft_r = ft_by_id[pid]

        # Build the case entry once; the disagreement list extends it with a
        # type tag (previously the whole dict was duplicated verbatim).
        case = {
            "problem_id": pid,
            "premise": raw_r["premise"],
            "choice1": raw_r["choice1"],
            "choice2": raw_r["choice2"],
            "true_label": raw_r["true_label"],
            "raw": {
                "predicted_label": raw_r["predicted_label"],
                "reasoning": raw_r["reasoning"],
                "correct": raw_r["correct"]
            },
            "finetuned": {
                "predicted_label": ft_r["predicted_label"],
                "reasoning": ft_r["reasoning"],
                "correct": ft_r["correct"]
            }
        }
        all_cases.append(case)

        # Only record cases where exactly one model is correct.
        if raw_r['correct'] == ft_r['correct']:
            continue

        if raw_r['correct'] and not ft_r['correct']:
            disagreement_type = "raw_correct_finetuned_wrong"
        else:
            disagreement_type = "finetuned_correct_raw_wrong"

        disagreement_cases.append({**case, "disagreement_type": disagreement_type})

    disagreement_file = os.path.join(output_dir, f"disagreement_cases_{timestamp}.json")
    with open(disagreement_file, "w") as f:
        json.dump(disagreement_cases, f, indent=2)
    print(f"💾 Disagreement cases saved to: {disagreement_file}")

    all_cases_file = os.path.join(output_dir, f"all_cases_{timestamp}.json")
    with open(all_cases_file, "w") as f:
        json.dump(all_cases, f, indent=2)
    print(f"💾 All cases saved to: {all_cases_file}")

    return summary
|
|
def evaluate_all_checkpoints(args):
    """Evaluate all checkpoints in a directory.

    Runs the raw model once (unless --skip_raw), then every checkpoint-*
    directory in args.checkpoint_dir, appending each result to a
    timestamped summary JSON after every checkpoint so partial progress
    survives a crash.

    Args:
        args: parsed CLI args (uses checkpoint_dir, cuda_device, batch_size,
            split, max_samples, skip_raw).
    """
    checkpoint_dir = args.checkpoint_dir

    # Normalize to an absolute path before any existence checks.
    if not os.path.isabs(checkpoint_dir):
        checkpoint_dir = os.path.abspath(checkpoint_dir)

    if not os.path.exists(checkpoint_dir):
        print(f"❌ Error: Checkpoint directory does not exist: {checkpoint_dir}")
        return

    print("="*80)
    print("🚀 COPA EVALUATION: ALL CHECKPOINTS")
    print("="*80)
    print(f"Checkpoint Directory: {checkpoint_dir}")
    print(f"CUDA Device: {args.cuda_device}")
    print(f"Batch Size: {args.batch_size}")
    print(f"Split: {args.split}")
    if args.max_samples:
        print(f"Max Samples: {args.max_samples}")
    print("="*80)

    # Discover checkpoint-* subdirectories.
    all_items = os.listdir(checkpoint_dir)
    checkpoint_dirs = [
        d for d in all_items
        if d.startswith('checkpoint-') and os.path.isdir(os.path.join(checkpoint_dir, d))
    ]

    if not checkpoint_dirs:
        print(f"❌ No checkpoint directories found in: {checkpoint_dir}")
        print(f" Looking for directories named 'checkpoint-*'")
        return

    # Sort numerically by training step (suffix after 'checkpoint-').
    checkpoint_dirs.sort(key=lambda x: int(x.split('-')[1]))

    print(f"\n📁 Found {len(checkpoint_dirs)} checkpoints:")
    for ckpt in checkpoint_dirs:
        print(f" - {ckpt}")
    print()

    # Evaluate the raw model a single time; every checkpoint compares to it.
    raw_results = None
    if not args.skip_raw:
        print("\n" + "="*80)
        print("🤖 EVALUATING RAW MODEL (once)")
        print("="*80)
        raw_model, raw_tokenizer = load_raw_model(args.cuda_device)
        raw_results = evaluate_on_copa(raw_model, raw_tokenizer, args.max_samples, "Raw Model", args.batch_size, args.split)
        if raw_results is None:
            print("❌ Failed to evaluate raw model")
            return
        # Release GPU memory before loading checkpoints.
        del raw_model
        torch.cuda.empty_cache()
        print(f"\n✅ Raw model evaluation complete")
        print(f" Accuracy: {raw_results['accuracy']:.4f}")
        print(f" F1 Score: {raw_results['f1']:.4f}")

    # Create the summary file up front so it exists even if a later
    # checkpoint crashes the run.
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    summary_data = {
        'evaluation_time': timestamp,
        'dataset': 'COPA (cause questions only)',
        'split': args.split,
        'checkpoint_directory': checkpoint_dir,
        'num_checkpoints_evaluated': len(checkpoint_dirs),
        'raw_model': {
            'path': RAW_MODEL_PATH,
            # Drop the bulky per-sample 'results' list from the summary.
            'results': {k: v for k, v in raw_results.items() if k != 'results'} if raw_results else 'not_evaluated'
        },
        'checkpoints': []
    }

    summary_file = os.path.join(OUTPUT_DIR, f"copa_all_checkpoints_summary_{timestamp}.json")
    with open(summary_file, 'w') as f:
        json.dump(summary_data, f, indent=2)

    all_checkpoint_results = []

    for i, ckpt_name in enumerate(checkpoint_dirs, 1):
        checkpoint_path = os.path.join(checkpoint_dir, ckpt_name)

        print("\n" + "="*80)
        print(f"🎯 EVALUATING CHECKPOINT {i}/{len(checkpoint_dirs)}: {ckpt_name}")
        print("="*80)

        try:
            finetuned_model, finetuned_tokenizer = load_finetuned_model(checkpoint_path, args.cuda_device)
            finetuned_results = evaluate_on_copa(
                finetuned_model, finetuned_tokenizer, args.max_samples,
                f"{ckpt_name}", args.batch_size, args.split
            )

            if finetuned_results is None:
                print(f"❌ Failed to evaluate {ckpt_name}")
                continue

            # Free GPU memory before the next checkpoint.
            del finetuned_model
            torch.cuda.empty_cache()

            checkpoint_info = {
                'checkpoint_name': ckpt_name,
                'checkpoint_path': checkpoint_path,
                'results': finetuned_results
            }
            summary_data["checkpoints"].append(
                {
                    'name': checkpoint_info['checkpoint_name'],
                    'path': checkpoint_info['checkpoint_path'],
                    'metrics': {
                        'accuracy': checkpoint_info['results']['accuracy'],
                        'f1': checkpoint_info['results']['f1'],
                        'precision': checkpoint_info['results']['precision'],
                        'recall': checkpoint_info['results']['recall']
                    }
                })

            # Rewrite the summary after every checkpoint (incremental save).
            with open(summary_file, 'w') as f:
                json.dump(summary_data, f, indent=2)

            all_checkpoint_results.append(checkpoint_info)

            print(f"\n✅ {ckpt_name} evaluation complete")
            print(f" Accuracy: {finetuned_results['accuracy']:.4f} ({finetuned_results['accuracy']*100:.2f}%)")
            print(f" F1 Score: {finetuned_results['f1']:.4f}")

            if raw_results:
                acc_improvement = finetuned_results['accuracy'] - raw_results['accuracy']
                f1_improvement = finetuned_results['f1'] - raw_results['f1']
                print(f" 📈 Improvement vs Raw: Acc {acc_improvement:+.4f}, F1 {f1_improvement:+.4f}")

        except Exception as e:
            # Keep going: one broken checkpoint should not abort the sweep.
            print(f"❌ Error evaluating {ckpt_name}: {e}")
            import traceback
            traceback.print_exc()
            continue

    print("\n" + "="*80)
    print("📊 SUMMARY: ALL CHECKPOINTS COMPARISON (COPA)")
    print("="*80)

    if raw_results:
        print(f"\n🤖 RAW MODEL:")
        print(f" Accuracy: {raw_results['accuracy']:.4f}")
        print(f" F1 Score: {raw_results['f1']:.4f}")

    print(f"\n🎯 FINE-TUNED CHECKPOINTS:")
    # NOTE(review): the per-checkpoint delta table below requires
    # raw_results, so with --skip_raw it is skipped entirely.
    if raw_results:
        print(f" {'Checkpoint':<20} {'Accuracy':<15} {'F1 Score':<15} {'Acc Δ':<12} {'F1 Δ':<12}")
        print(f" {'-'*80}")

        for checkpoint_info in all_checkpoint_results:
            res = checkpoint_info['results']
            acc_delta = res['accuracy'] - raw_results['accuracy']
            f1_delta = res['f1'] - raw_results['f1']

            print(f" {checkpoint_info['checkpoint_name']:<20} "
                  f"{res['accuracy']:.4f} "
                  f"{res['f1']:.4f} "
                  f"{acc_delta:+.4f} "
                  f"{f1_delta:+.4f}")

    # Best checkpoint by raw accuracy on the filtered COPA subset.
    if all_checkpoint_results:
        best_ckpt = max(all_checkpoint_results, key=lambda x: x['results']['accuracy'])
        print(f"\n🏆 BEST CHECKPOINT: {best_ckpt['checkpoint_name']}")
        print(f" Accuracy: {best_ckpt['results']['accuracy']:.4f}")
        print(f" F1 Score: {best_ckpt['results']['f1']:.4f}")

    print(f"\n💾 All results saved to: {summary_file}")
    print("="*80 + "\n")
|
|
def print_comparison(summary):
    """Pretty-print the raw-vs-fine-tuned comparison summary to stdout."""
    bar = "=" * 80

    def _show_metrics(m):
        # One metric block: accuracy with a percentage, the rest plain.
        print(f" Accuracy: {m['accuracy']:.4f} ({m['accuracy']*100:.2f}%)")
        print(f" F1 Score: {m['f1']:.4f}")
        print(f" Precision: {m['precision']:.4f}")
        print(f" Recall: {m['recall']:.4f}")

    print("\n" + bar)
    print("📊 COPA EVALUATION: RAW vs FINE-TUNED MODEL")
    print(bar)

    raw_metrics = summary['raw_model']['metrics']
    ft_metrics = summary['finetuned_model']['metrics']

    print("\n🤖 RAW MODEL:")
    _show_metrics(raw_metrics)

    print("\n🎯 FINE-TUNED MODEL:")
    print(f" Checkpoint: {os.path.basename(summary['finetuned_model']['checkpoint'])}")
    _show_metrics(ft_metrics)

    print("\n📈 IMPROVEMENTS:")
    comp = summary['comparison']
    acc_imp = comp['accuracy_improvement']
    f1_imp = comp['f1_improvement']

    print(f" Accuracy: {acc_imp:+.4f} ({acc_imp*100:+.2f}%)")
    print(f" F1 Score: {f1_imp:+.4f} ({f1_imp*100:+.2f}%)")

    print("\n" + "-" * 80)

    if comp['overall_improved']:
        print("✅ RESULT: Fine-tuning IMPROVED causal reasoning performance!")
        print(f" • Accuracy improved by {acc_imp:.4f}")
    else:
        print("⚠️ RESULT: Fine-tuning did not improve causal reasoning.")

    print(bar + "\n")
|
|
def _build_arg_parser():
    """Build the CLI argument parser for this script.

    Extracted from main() so the argument declarations are separate from the
    evaluation orchestration. The --output_path default reads the module-level
    OUTPUT_DIR (environment-configurable) at call time, matching the original
    in-main construction order.
    """
    parser = argparse.ArgumentParser(description='Evaluate raw vs fine-tuned model on COPA dataset')
    parser.add_argument('--max_samples', type=int, default=None,
                        help='Maximum number of samples to evaluate (default: all samples)')
    parser.add_argument('--cuda_device', type=str, default='0',
                        help='CUDA device to use (default: 0)')
    parser.add_argument('--batch_size', type=int, default=4,
                        help='Batch size for evaluation (default: 4)')
    parser.add_argument('--split', type=str, default='validation', choices=['train', 'test', 'validation'],
                        help='Dataset split to use (default: validation)')
    parser.add_argument('--skip_raw', action='store_true',
                        help='Skip raw model evaluation')
    parser.add_argument('--skip_finetuned', action='store_true',
                        help='Skip fine-tuned model evaluation')
    parser.add_argument('--checkpoint_path', type=str, default=None,
                        help='Path to specific checkpoint to evaluate')
    parser.add_argument('--checkpoint_dir', type=str, default=None,
                        help='Path to directory containing multiple checkpoints')
    parser.add_argument('--evaluate_checkpoints', type=int, default=0,
                        help='If set to 1, run per-checkpoint mode: '
                             'evaluate the given --checkpoint_path vs cached raw results and '
                             'save all_cases/disagreement_cases under OUTPUT_DIR/checkpoint/dataset_name.')
    parser.add_argument('--run', type=str, default="run",
                        help='Which training run to use for the output directory.')
    parser.add_argument('--raw_path', type=str, default=None,
                        help='The raw model path')
    parser.add_argument('--output_path', type=str, default=OUTPUT_DIR,
                        help='Model output path, defaults to env variable.')
    return parser


def main():
    """Entry point for COPA evaluation.

    Supports three mutually exclusive modes:
      1. --evaluate_checkpoints 1 with --checkpoint_path: per-checkpoint case
         dump via evaluate_checkpoint_cases().
      2. --checkpoint_dir: evaluate every checkpoint in the directory via
         evaluate_all_checkpoints().
      3. Default: evaluate the raw model and the best (or user-specified)
         fine-tuned checkpoint, then save and print a side-by-side comparison.
    """
    global RAW_MODEL_PATH, OUTPUT_DIR
    args = _build_arg_parser().parse_args()

    # Redirect all result output to the user-chosen directory.
    OUTPUT_DIR = args.output_path

    # --- Flag validation: single checkpoint vs directory modes conflict. ---
    if args.checkpoint_path and args.checkpoint_dir:
        print("❌ Error: Cannot use both --checkpoint_path and --checkpoint_dir")
        print("   Use --checkpoint_path for a single checkpoint")
        print("   Use --checkpoint_dir to evaluate all checkpoints in a directory")
        return

    if args.evaluate_checkpoints == 1 and args.checkpoint_dir:
        print("❌ Error: --evaluate_checkpoints 1 is only supported with --checkpoint_path (single checkpoint).")
        print("   Please pass a single --checkpoint_path, or omit --evaluate_checkpoints to use --checkpoint_dir.")
        return

    # NOTE(review): torch is imported at module level, so this only takes effect
    # because CUDA initialization is lazy — confirm no CUDA call happens earlier.
    os.environ['CUDA_VISIBLE_DEVICES'] = args.cuda_device

    if args.raw_path:
        RAW_MODEL_PATH = args.raw_path

    # --- Mode 1: per-checkpoint case-dump evaluation. ---
    if args.evaluate_checkpoints == 1:
        if not args.checkpoint_path:
            print("❌ Error: --evaluate_checkpoints 1 requires --checkpoint_path to be set.")
            return

        print("="*80)
        print("🚀 Copa PER-CHECKPOINT EVALUATION MODE")
        print("="*80)
        print(f"Raw Model: {RAW_MODEL_PATH}")
        print(f"Output Dir: {OUTPUT_DIR}")
        print(f"CUDA Device: {args.cuda_device}")
        print(f"Split: {args.split}")
        if args.max_samples:
            print(f"Max Samples: {args.max_samples}")
        print(f"Checkpoint: {args.checkpoint_path}")
        print("="*80)

        evaluate_checkpoint_cases(args, args.checkpoint_path)
        print(f"\n✅ Per-checkpoint evaluation finished for: {args.checkpoint_path}")
        print(f"   Results root directory: {OUTPUT_DIR}")
        return

    # --- Mode 2: evaluate every checkpoint in a directory. ---
    if args.checkpoint_dir:
        evaluate_all_checkpoints(args)
        return

    # --- Mode 3: raw vs single fine-tuned checkpoint comparison. ---
    print("="*70)
    print("🚀 COPA EVALUATION: RAW vs FINE-TUNED")
    print("="*70)
    print(f"Raw Model: {RAW_MODEL_PATH}")
    print(f"Training Dir: {TRAINING_DIR}")
    print(f"CUDA Device: {args.cuda_device}")
    print(f"Batch Size: {args.batch_size}")
    print(f"Split: {args.split}")
    print(f"Task: Identify CAUSE given EFFECT")
    if args.max_samples:
        print(f"Max Samples: {args.max_samples}")
    print("="*70)

    # Resolve which fine-tuned checkpoint to evaluate: an explicit
    # --checkpoint_path wins; otherwise pick the best one by val metrics.
    if not args.skip_finetuned:
        if args.checkpoint_path:
            checkpoint_path = args.checkpoint_path
            if not os.path.isabs(checkpoint_path):
                checkpoint_path = os.path.abspath(checkpoint_path)

            if not os.path.exists(checkpoint_path):
                print(f"❌ Error: Checkpoint path does not exist: {checkpoint_path}")
                return

            print(f"✅ Using user-specified checkpoint: {os.path.basename(checkpoint_path)}")
            best_checkpoint_info = {
                'path': checkpoint_path,
                'score': 'N/A (manually specified)'
            }
        else:
            best_checkpoint_path, best_score = find_best_checkpoint(TRAINING_DIR)
            if best_checkpoint_path is None:
                print("❌ No valid checkpoint found!")
                return
            best_checkpoint_info = {
                'path': best_checkpoint_path,
                'score': best_score
            }
    else:
        best_checkpoint_info = None

    # Evaluate the raw model first, then free its GPU memory before the
    # fine-tuned model is loaded (only one model resident at a time).
    if not args.skip_raw:
        raw_model, raw_tokenizer = load_raw_model(args.cuda_device)
        raw_results = evaluate_on_copa(raw_model, raw_tokenizer, args.max_samples, "Raw Model", args.batch_size, args.split)
        if raw_results is None:
            print("❌ Failed to evaluate raw model")
            return
        del raw_model
        torch.cuda.empty_cache()
    else:
        raw_results = None
        print("\n⏭️ Skipping raw model evaluation")

    if not args.skip_finetuned:
        finetuned_model, finetuned_tokenizer = load_finetuned_model(best_checkpoint_info['path'], args.cuda_device)
        finetuned_results = evaluate_on_copa(finetuned_model, finetuned_tokenizer, args.max_samples, "Fine-tuned Model", args.batch_size, args.split)
        if finetuned_results is None:
            print("❌ Failed to evaluate fine-tuned model")
            return
        del finetuned_model
        torch.cuda.empty_cache()
    else:
        finetuned_results = None
        print("\n⏭️ Skipping fine-tuned model evaluation")

    # Saving/printing the comparison requires BOTH result sets. Bug fix: the
    # "All results saved" message used to print unconditionally, even when a
    # --skip_* flag meant nothing had been saved at all.
    if raw_results and finetuned_results:
        summary = save_results(raw_results, finetuned_results, best_checkpoint_info, OUTPUT_DIR)
        print_comparison(summary)
        print(f"\n✅ All results saved to: {OUTPUT_DIR}")
    else:
        print("\n⚠️ Comparison summary not saved (both raw and fine-tuned results are required).")
|
|
# Script entry point: run the CLI evaluation only when executed directly,
# not when this module is imported by another script.
if __name__ == '__main__':
    main()
|
|