#!/usr/bin/env python3
"""
neulr_deductive Dataset Evaluation: Raw vs Fine-tuned Model
Evaluates models on the neulr_deductive math competition dataset.
neulr_deductive answers are integers from 0-999.
Usage:
python evaluate_neulr_deductive_raw_vs_finetuned.py [--max_samples N] [--batch_size N] [--checkpoint_dir PATH]
"""
import os
import json
import argparse
import re
from datetime import datetime
from tqdm import tqdm
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import time
import warnings
warnings.filterwarnings('ignore')
# ============================================================================
# Configuration
# ============================================================================
# Allow path injection from orchestrator
RAW_MODEL_PATH = os.environ.get('EVAL_RAW_MODEL_PATH',
"/home/moein_salimi/PLLMS/unsloth-Qwen2.5-3B-Instruct-unsloth-bnb-4bit")
TRAINING_DIR = os.environ.get('EVAL_TRAINING_DIR',
"/home/moein_salimi/users/amirmo/AbductiveReasoning/GRPO/results/dt11.10.16:42_e20_unsloth_Qwen2.5_3B_Instruct_unsloth_bnb_4bit_bnb_4bit_lr1e-05_t0.7_ε0.2_r64_b16")
CHECKPOINT_DIR = os.path.join(TRAINING_DIR, "checkpoint")
OUTPUT_DIR = os.environ.get('EVAL_OUTPUT_DIR',
"/home/moein_salimi/users/amirmo/AbductiveReasoning/GRPO/Evaluation/neulr_deductive_evaluation_results") # Change default per script
# ============================================================================
# Helper Functions
# ============================================================================
def find_best_checkpoint(training_dir):
"""Find the best checkpoint based on validation metrics."""
print("\n๐Ÿ“ Finding best checkpoint...")
val_metrics_path = os.path.join(training_dir, "val_metrics.json")
checkpoint_dir = os.path.join(training_dir, "checkpoint")
if not os.path.exists(val_metrics_path):
print(f"โš ๏ธ No val_metrics.json found, using latest checkpoint")
checkpoints = [d for d in os.listdir(checkpoint_dir)
if d.startswith('checkpoint-') and os.path.isdir(os.path.join(checkpoint_dir, d))]
if checkpoints:
latest = max(checkpoints, key=lambda x: int(x.split('-')[1]))
return os.path.join(checkpoint_dir, latest), 0.0
return None, 0.0
with open(val_metrics_path, 'r') as f:
val_metrics = json.load(f)
# Find epoch with highest avg_reward
best_epoch = None
best_score = 0.0
for epoch_str, metrics in val_metrics.items():
if metrics['avg_reward'] > best_score:
best_score = metrics['avg_reward']
best_epoch = float(epoch_str)
if best_epoch is None:
print("โš ๏ธ No valid metrics found, using latest checkpoint")
checkpoints = [d for d in os.listdir(checkpoint_dir)
if d.startswith('checkpoint-') and os.path.isdir(os.path.join(checkpoint_dir, d))]
if checkpoints:
latest = max(checkpoints, key=lambda x: int(x.split('-')[1]))
return os.path.join(checkpoint_dir, latest), 0.0
return None, 0.0
# Find closest checkpoint
checkpoints = [d for d in os.listdir(checkpoint_dir)
if d.startswith('checkpoint-') and os.path.isdir(os.path.join(checkpoint_dir, d))]
if not checkpoints:
return None, 0.0
checkpoint_steps = [(int(cp.split('-')[1]), cp) for cp in checkpoints]
checkpoint_steps.sort()
max_checkpoint_step = max(checkpoint_steps)[0]
# Assume 20 training epochs (matches the "_e20_" tag in TRAINING_DIR) to map epochs to steps
estimated_steps_per_epoch = max_checkpoint_step / 20.0
target_step = int(best_epoch * estimated_steps_per_epoch)
best_checkpoint = min(checkpoint_steps, key=lambda x: abs(x[0] - target_step))
checkpoint_path = os.path.join(checkpoint_dir, best_checkpoint[1])
print(f"โœ… Best checkpoint: {best_checkpoint[1]}")
print(f" Validation score: {best_score:.4f} at epoch {best_epoch:.2f}")
return checkpoint_path, best_score
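# Worked example of the epoch-to-step heuristic above (numbers illustrative): if the
# latest checkpoint is checkpoint-640 and training ran 20 epochs, then
# estimated_steps_per_epoch = 640 / 20 = 32; a best validation epoch of 12.5 maps to
# target_step = 400, and the checkpoint whose step is closest to 400 is selected.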
def load_raw_model(device):
"""Load the raw/base model."""
print(f"\n๐Ÿค– Loading raw model from: {RAW_MODEL_PATH}")
tokenizer = AutoTokenizer.from_pretrained(RAW_MODEL_PATH, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
RAW_MODEL_PATH,
torch_dtype=torch.float16,
device_map={"": f"cuda:0"},
trust_remote_code=True,
load_in_4bit=True,
)
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
model.eval()
print("โœ… Raw model loaded successfully")
return model, tokenizer
def load_finetuned_model(checkpoint_path, device):
"""Load the fine-tuned model with LoRA adapter."""
print(f"\n๐ŸŽฏ Loading fine-tuned model from: {checkpoint_path}")
# Load base model
base_tokenizer = AutoTokenizer.from_pretrained(RAW_MODEL_PATH, trust_remote_code=True)
base_model = AutoModelForCausalLM.from_pretrained(
RAW_MODEL_PATH,
torch_dtype=torch.float16,
device_map={"": f"cuda:0"},
trust_remote_code=True,
load_in_4bit=True,
)
# Load LoRA adapter
model = PeftModel.from_pretrained(base_model, checkpoint_path)
if base_tokenizer.pad_token is None:
base_tokenizer.pad_token = base_tokenizer.eos_token
model.eval()
print("โœ… Fine-tuned model loaded successfully")
return model, base_tokenizer
def create_neulr_deductive_prompt(problem, context):
"""Create a prompt for a detective-style multiple-choice reasoning question."""
system_prompt = """
You are a brilliant detective specializing in symbolic logic and pattern recognition.
You will be given a context containing logical rules regarding specific alphanumeric codes and a resulting question.
Your task:
1. Carefully parse the context to identify facts (who is what) and rules (who is afraid of whom).
2. Perform step-by-step deductive reasoning to trace the relationship from the subject in the question to the final answer.
3. Give the correct answer as the EXACT alphanumeric code from the text.
Your entire output MUST use exactly the following format and nothing else (no text before, between, or after these tags):
<reasoning>
[here you write your chain-of-thought reasoning, explicitly linking the individual to their group and the group to the object of their fear]
</reasoning>
<answer>
[here you output ONLY the exact alphanumeric code answer]
</answer>
"""
user_prompt = f"""
Context:
{context}
Problem:
{problem}
Solve this problem step by step using detective reasoning to find the logical connection, then provide your final answer in one word ONLY.
"""
return system_prompt, user_prompt
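# Illustrative completion this prompt is designed to elicit (the individual, group,
# and answer code below are made up):
#   <reasoning>
#   Rkdhv4 belongs to group T9pQ; every member of T9pQ is afraid of SPmgoBnY.
#   </reasoning>
#   <answer>
#   SPmgoBnY
#   </answer>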
def extract_reasoning(response):
"""Extract chain-of-thought reasoning from <reasoning>...</reasoning> tags, if present."""
match = re.search(r'<reasoning>(.*?)</reasoning>', response, re.IGNORECASE | re.DOTALL)
if match:
return match.group(1).strip()
return None
def extract_answer(response):
"""
Extract the alphanumeric code string from the <answer>...</answer> block.
Example: <answer> SPmgoBnY </answer> -> Returns "SPmgoBnY"
"""
if not response:
return None
match = re.search(r'<answer>(.*?)</answer>', response, re.IGNORECASE | re.DOTALL)
if match:
clean_answer = match.group(1).strip()
clean_answer = clean_answer.rstrip('.')
return clean_answer
return None
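# Behavior notes for extract_answer (values illustrative):
#   extract_answer("<answer> SPmgoBnY. </answer>") -> "SPmgoBnY"  (whitespace and a
#   trailing period are stripped); a response with no <answer> tags -> None, which
#   the evaluation loop below counts as a failed extraction.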
def evaluate_on_neulr_deductive(model, tokenizer, max_samples=None, model_name="Model", batch_size=1, split='train'):
"""Evaluate model on neulr_deductive dataset with batch processing support."""
print(f"\n๐Ÿ” Evaluating {model_name} on neulr_deductive dataset...")
print(f" Batch size: {batch_size}")
print(f" Split: {split}")
# Load the local NeuLR deductive dataset; the JSON file always loads as a single "train" split
print(f"Loading neulr_deductive dataset (split={split})...")
dataset = load_dataset("json", data_files="/home/moein_salimi/users/amirmo/AbductiveReasoning/datasets/NeuLR/deductive_neutral.json")["train"]
if max_samples:
dataset = dataset.select(range(min(max_samples, len(dataset))))
print(f"Evaluating on {len(dataset)} samples (limited)")
else:
print(f"Evaluating on {len(dataset)} samples (full dataset)")
results = []
correct = 0
total = 0
failed_extractions = 0
# Process in batches
num_batches = (len(dataset) + batch_size - 1) // batch_size
btime = time.time()
for batch_idx in tqdm(range(num_batches), desc=f"Evaluating {model_name}"):
# Get batch
start_idx = batch_idx * batch_size
end_idx = min(start_idx + batch_size, len(dataset))
batch = dataset[start_idx:end_idx]
# Handle both single sample and batch cases
if not isinstance(batch['question'], list):
batch = {k: [v] for k, v in batch.items()}
batch_size_actual = len(batch["question"])
# Prepare prompts for batch
formatted_prompts = []
true_answers = []
batch_data = []
for i in range(batch_size_actual):
problem = batch["question"][i]
context = batch["context"][i]
true_answer = str(batch["label"][i])
# Create prompt
system_prompt, user_prompt = create_neulr_deductive_prompt(problem, context)
# Format with chat template if available
try:
messages = [
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_prompt}
]
formatted_prompt = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True
)
except Exception:
# Fallback if chat template not available
formatted_prompt = f"{system_prompt}\n\n{user_prompt}"
formatted_prompts.append(formatted_prompt)
true_answers.append(true_answer)
batch_data.append({
'question': problem,
'id': batch['id'][i] if 'id' in batch else start_idx + i
})
# Tokenize batch with padding
inputs = tokenizer(
formatted_prompts,
return_tensors="pt",
padding=True,
truncation=True,
max_length=2048
)
inputs = {k: v.to(model.device) for k, v in inputs.items()}
# Generate for batch
with torch.no_grad():
outputs = model.generate(
**inputs,
max_new_tokens=4096, # generous budget for multi-step deductive reasoning
do_sample=False, # greedy decoding for deterministic, reproducible answers
pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
)
# Process each output in batch
for i in range(batch_size_actual):
# Decode response (skip input tokens)
input_length = inputs['input_ids'][i].shape[0]
response = tokenizer.decode(outputs[i][input_length:], skip_special_tokens=True)
# Extract answer
predicted_answer = extract_answer(response)
# Keep the full response as the reasoning trace (extract_reasoning(response)
# would keep only the <reasoning> block)
reasoning = response
if predicted_answer is None:
failed_extractions += 1
predicted_answer = -1 # sentinel; never equals a string label, so it is scored as wrong
# Check correctness
true_answer = true_answers[i]
is_correct = (predicted_answer == true_answer)
if is_correct:
correct += 1
total += 1
# Store result
results.append({
'problem_id': batch_data[i]['id'],
'question': batch_data[i]['question'],
'true_answer': true_answer,
'predicted_answer': predicted_answer,
'reasoning': reasoning,
'correct': is_correct
})
etime = time.time()
print(f"Batch processing time: {etime - btime:.2f} seconds")
accuracy = correct / total if total > 0 else 0.0
# Calculate additional metrics
extraction_rate = (total - failed_extractions) / total if total > 0 else 0.0
print(f"\n๐Ÿ“Š {model_name} Results:")
print(f" Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%) - {correct}/{total} correct")
print(f" Extraction Rate: {extraction_rate:.4f} ({extraction_rate*100:.2f}%) - {total - failed_extractions}/{total} extracted")
print(f" Failed extractions: {failed_extractions}/{total} ({failed_extractions/total*100:.1f}%)")
return {
'accuracy': accuracy,
'correct': correct,
'total': total,
'failed_extractions': failed_extractions,
'extraction_rate': extraction_rate,
'time': etime - btime,
'results': results
}
def evaluate_model_with_dynamic_batch(model, tokenizer, args, model_name):
"""Evaluate a model with automatic batch-size backoff to avoid CUDA OOM."""
results = None
batch_size = args.batch_size
while batch_size >= 1 and results is None:
try:
print(f"\n๐Ÿงช Evaluating {model_name} with batch_size={batch_size}")
results = evaluate_on_neulr_deductive(
model,
tokenizer,
args.max_samples,
model_name,
batch_size,
args.split
)
print(f"โœ… {model_name} evaluation succeeded with batch_size={batch_size}")
except torch.cuda.OutOfMemoryError:
print(f"โš ๏ธ CUDA OutOfMemoryError at batch_size={batch_size}, halving batch size...")
results = None
except RuntimeError as e:
if "out of memory" in str(e).lower():
print(f"โš ๏ธ RuntimeError OOM at batch_size={batch_size}, halving batch size...")
results = None
else:
raise
if results is None:
torch.cuda.empty_cache()
batch_size = batch_size // 2
if results is None:
print(f"❌ {model_name}: still out of memory even at batch_size=1, giving up.")
return results
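# Example of the backoff (illustrative): with --batch_size 8, a CUDA OOM triggers
# retries at batch sizes 4, 2, and 1; only if batch_size=1 still OOMs does the
# function return None.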
def ensure_raw_results_cached(args):
"""
Ensure raw neulr_deductive results are cached on disk for the current configuration.
Returns the loaded or newly computed raw_results dict.
"""
dataset_name = "neulr_deductive"
split = args.split
sample_tag = f"max{args.max_samples}" if args.max_samples else "all"
raw_results_dir = os.path.join(OUTPUT_DIR, "raw_model", dataset_name)
os.makedirs(raw_results_dir, exist_ok=True)
raw_results_file = os.path.join(
raw_results_dir,
f"raw_results_{split}_{sample_tag}.json"
)
if os.path.exists(raw_results_file):
print(f"\n๐Ÿ“‚ Found cached raw model results: {raw_results_file}")
with open(raw_results_file, "r") as f:
raw_results = json.load(f)
return raw_results
print("\n๐Ÿ” No cached raw model results found for this configuration.")
print(" Running raw model once and caching per-sample results...")
raw_model, raw_tokenizer = load_raw_model(args.cuda_device)
raw_results = evaluate_model_with_dynamic_batch(
raw_model, raw_tokenizer, args, "Raw Model (cached)"
)
del raw_model
torch.cuda.empty_cache()
if raw_results is None:
print("โŒ Failed to compute raw model results; cannot cache.")
return None
raw_results_with_meta = {
"model_path": RAW_MODEL_PATH,
"dataset": dataset_name,
"split": split,
"max_samples": args.max_samples,
**raw_results
}
with open(raw_results_file, "w") as f:
json.dump(raw_results_with_meta, f, indent=2)
print(f"๐Ÿ’พ Cached raw model results saved to: {raw_results_file}")
return raw_results_with_meta
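# Resulting cache layout (illustrative filenames, following the naming scheme above):
#   OUTPUT_DIR/raw_model/neulr_deductive/raw_results_train_all.json    (full dataset)
#   OUTPUT_DIR/raw_model/neulr_deductive/raw_results_train_max50.json  (--max_samples 50)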
def ensure_finetuned_results_cached(args, ckpt_name):
"""
Ensure fine-tuned model results are cached on disk for the current configuration.
Returns the loaded or newly computed fine-tuned results dict.
"""
dataset_name = "neulr_deductive"
ckpt_output_dir = os.path.join(OUTPUT_DIR, args.run, ckpt_name, dataset_name)
if os.path.exists(ckpt_output_dir) and os.path.exists(os.path.join(ckpt_output_dir, "disagreement_cases.json")) and os.path.exists(os.path.join(ckpt_output_dir, "all_cases.json")):
print(f"\n๐Ÿ“‚ Found cached fine-tuned model results: {ckpt_output_dir}")
return True
print("\n๐Ÿ” No cached fine-tuned model results found for this configuration.")
return False
def evaluate_checkpoint_cases(args, checkpoint_path):
"""
Given a single checkpoint, evaluate it vs cached raw results and save:
- all_cases.json
- disagreement_cases.json
under: OUTPUT_DIR/<run>/<checkpoint_name>/neulr_deductive/
"""
print(f"\n๐Ÿ“ Checkpoint path argument received: {checkpoint_path}")
if not os.path.isabs(checkpoint_path):
checkpoint_path = os.path.abspath(checkpoint_path)
print(f" Converted to absolute path: {checkpoint_path}")
if not os.path.exists(checkpoint_path):
print(f"โŒ Error: Checkpoint path does not exist: {checkpoint_path}")
print(f" Please check the path and try again.")
return
ckpt_name = os.path.basename(checkpoint_path.rstrip("/"))
print(f"โœ… Using checkpoint for per-case evaluation: {ckpt_name}")
# Get cached (or newly computed) raw results
raw_results = ensure_raw_results_cached(args)
if raw_results is None:
print("โŒ Cannot evaluate checkpoint without raw model results.")
return
# Get cached (or newly computed) fine-tuned results
if ensure_finetuned_results_cached(args, ckpt_name):
print(f"โœ… Using cached fine-tuned model results for per-case evaluation: {ckpt_name}")
return
# Evaluate fine-tuned checkpoint
finetuned_model, finetuned_tokenizer = load_finetuned_model(checkpoint_path, args.cuda_device)
finetuned_results = evaluate_model_with_dynamic_batch(
finetuned_model,
finetuned_tokenizer,
args,
f"Fine-tuned Model ({ckpt_name})"
)
del finetuned_model
torch.cuda.empty_cache()
if finetuned_results is None:
print("โŒ Fine-tuned model evaluation failed; aborting.")
return
# Build per-case comparison
dataset_name = "neulr_deductive"
ckpt_output_dir = os.path.join(OUTPUT_DIR, args.run, ckpt_name, dataset_name)
os.makedirs(ckpt_output_dir, exist_ok=True)
# Pair raw and fine-tuned results by dataset position (1-based), since both runs
# iterate the dataset in the same order
raw_by_id = {idx + 1: r for idx, r in enumerate(raw_results["results"])}
ft_by_id = {idx + 1: r for idx, r in enumerate(finetuned_results["results"])}
disagreement_cases = []
for pid, raw_r in raw_by_id.items():
if pid not in ft_by_id:
continue
ft_r = ft_by_id[pid]
case_entry = {
"problem_id": pid,
"problem": raw_r["question"],
"true_answer": raw_r["true_answer"],
"raw": {
"predicted_answer": raw_r["predicted_answer"],
"reasoning": raw_r["reasoning"],
"correct": raw_r["correct"]
},
"finetuned": {
"predicted_answer": ft_r["predicted_answer"],
"reasoning": ft_r["reasoning"],
"correct": ft_r["correct"]
}
}
if raw_r["correct"] == ft_r["correct"]:
continue
if raw_r["correct"] and not ft_r["correct"]:
disagreement_type = "raw_correct_finetuned_wrong"
else:
disagreement_type = "finetuned_correct_raw_wrong"
disagreement_cases.append({
**case_entry,
"disagreement_type": disagreement_type
})
disagreement_file = os.path.join(ckpt_output_dir, "disagreement_cases.json")
with open(disagreement_file, "w") as f:
json.dump(disagreement_cases, f, indent=2)
print(f"๐Ÿ’พ Disagreement cases saved to: {disagreement_file}")
finetune_results_with_meta = {
"dataset": dataset_name,
"max_samples": args.max_samples,
**finetuned_results
}
finetune_results_file = os.path.join(ckpt_output_dir, "all_cases.json")
with open(finetune_results_file, "w") as f:
json.dump(finetune_results_with_meta, f, indent=2)
print(f"๐Ÿ’พ finetune model results saved to: {finetune_results_file}")
return {
"raw_results": raw_results,
"finetuned_results": finetuned_results,
"all_cases_file": finetune_results_file,
"disagreement_file": disagreement_file
}
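# Output layout for a single checkpoint (paths follow the scheme above):
#   OUTPUT_DIR/<run>/<checkpoint_name>/neulr_deductive/all_cases.json          (every problem)
#   OUTPUT_DIR/<run>/<checkpoint_name>/neulr_deductive/disagreement_cases.json (only problems
#   where exactly one of the two models answered correctly)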
def save_results(raw_results, finetuned_results, best_checkpoint_info, output_dir):
"""Save evaluation results to JSON files."""
os.makedirs(output_dir, exist_ok=True)
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# Save raw model results
raw_output = {
'model': RAW_MODEL_PATH,
'evaluation_time': timestamp,
'metrics': {
'accuracy': raw_results['accuracy'],
'extraction_rate': raw_results['extraction_rate']
},
'correct': raw_results['correct'],
'total': raw_results['total'],
'failed_extractions': raw_results['failed_extractions'],
'detailed_results': raw_results['results']
}
raw_file = os.path.join(output_dir, f"raw_model_results_{timestamp}.json")
with open(raw_file, 'w') as f:
json.dump(raw_output, f, indent=2)
print(f"\n๐Ÿ’พ Raw model results saved to: {raw_file}")
# Save fine-tuned model results
finetuned_output = {
'base_model': RAW_MODEL_PATH,
'checkpoint': best_checkpoint_info['path'],
'validation_score': best_checkpoint_info['score'],
'evaluation_time': timestamp,
'metrics': {
'accuracy': finetuned_results['accuracy'],
'extraction_rate': finetuned_results['extraction_rate']
},
'correct': finetuned_results['correct'],
'total': finetuned_results['total'],
'failed_extractions': finetuned_results['failed_extractions'],
'detailed_results': finetuned_results['results']
}
finetuned_file = os.path.join(output_dir, f"finetuned_model_results_{timestamp}.json")
with open(finetuned_file, 'w') as f:
json.dump(finetuned_output, f, indent=2)
print(f"๐Ÿ’พ Fine-tuned model results saved to: {finetuned_file}")
# Save comparison summary
improvement = finetuned_results['accuracy'] - raw_results['accuracy']
relative_improvement = (improvement / raw_results['accuracy'] * 100) if raw_results['accuracy'] > 0 else 0
extraction_improvement = finetuned_results['extraction_rate'] - raw_results['extraction_rate']
summary = {
'evaluation_time': timestamp,
'dataset': 'neulr_deductive',
'split': 'train',
'num_samples': raw_results['total'],
'raw_model': {
'path': RAW_MODEL_PATH,
'metrics': {
'accuracy': raw_results['accuracy'],
'extraction_rate': raw_results['extraction_rate']
},
'correct': raw_results['correct'],
'total': raw_results['total'],
'failed_extractions': raw_results['failed_extractions']
},
'finetuned_model': {
'base_model': RAW_MODEL_PATH,
'checkpoint': best_checkpoint_info['path'],
'validation_score': best_checkpoint_info['score'],
'metrics': {
'accuracy': finetuned_results['accuracy'],
'extraction_rate': finetuned_results['extraction_rate']
},
'correct': finetuned_results['correct'],
'total': finetuned_results['total'],
'failed_extractions': finetuned_results['failed_extractions']
},
'comparison': {
'accuracy_improvement': improvement,
'accuracy_relative_improvement_percent': relative_improvement,
'extraction_improvement': extraction_improvement,
'overall_improved': improvement > 0
}
}
summary_file = os.path.join(output_dir, f"comparison_summary_{timestamp}.json")
with open(summary_file, 'w') as f:
json.dump(summary, f, indent=2)
print(f"๐Ÿ’พ Comparison summary saved to: {summary_file}")
# Save disagreement and all cases summary
raw_by_id = {r['problem_id']: r for r in raw_results['results']}
ft_by_id = {r['problem_id']: r for r in finetuned_results['results']}
disagreement_cases, all_cases = [], []
for pid, raw_r in raw_by_id.items():
if pid not in ft_by_id:
continue
ft_r = ft_by_id[pid]
all_cases.append({
"problem_id": pid,
"problem": raw_r["question"],
"true_answer": raw_r["true_answer"],
"raw": {
"predicted_answer": raw_r["predicted_answer"],
"reasoning": raw_r["reasoning"],
"correct": raw_r["correct"]
},
"finetuned": {
"predicted_answer": ft_r["predicted_answer"],
"reasoning": ft_r["reasoning"],
"correct": ft_r["correct"]
}
})
if raw_r['correct'] == ft_r['correct']:
continue
if raw_r['correct'] and not ft_r['correct']:
disagreement_type = "raw_correct_finetuned_wrong"
else:
disagreement_type = "finetuned_correct_raw_wrong"
disagreement_cases.append({
"problem_id": pid,
"problem": raw_r["question"],
"true_answer": raw_r["true_answer"],
"raw": {
"predicted_answer": raw_r["predicted_answer"],
"reasoning": raw_r["reasoning"],
"correct": raw_r["correct"]
},
"finetuned": {
"predicted_answer": ft_r["predicted_answer"],
"reasoning": ft_r["reasoning"],
"correct": ft_r["correct"]
},
"disagreement_type": disagreement_type
})
disagreement_file = os.path.join(output_dir, f"disagreement_cases_{timestamp}.json")
with open(disagreement_file, "w") as f:
json.dump(disagreement_cases, f, indent=2)
print(f"๐Ÿ’พ Disagreement cases saved to: {disagreement_file}")
all_cases_file = os.path.join(output_dir, f"all_cases_{timestamp}.json")
with open(all_cases_file, "w") as f:
json.dump(all_cases, f, indent=2)
print(f"๐Ÿ’พ All cases saved to: {all_cases_file}")
return summary
def evaluate_all_checkpoints(args):
"""Evaluate all checkpoints in a directory."""
checkpoint_dir = args.checkpoint_dir
# Handle relative vs absolute paths
if not os.path.isabs(checkpoint_dir):
checkpoint_dir = os.path.abspath(checkpoint_dir)
if not os.path.exists(checkpoint_dir):
print(f"โŒ Error: Checkpoint directory does not exist: {checkpoint_dir}")
return
print("="*80)
print("๐Ÿš€ neulr_deductive EVALUATION: ALL CHECKPOINTS")
print("="*80)
print(f"Checkpoint Directory: {checkpoint_dir}")
print(f"CUDA Device: {args.cuda_device}")
print(f"Batch Size: {args.batch_size}")
if args.max_samples:
print(f"Max Samples: {args.max_samples}")
print("="*80)
# Find all checkpoint directories
all_items = os.listdir(checkpoint_dir)
checkpoint_dirs = [
d for d in all_items
if d.startswith('checkpoint-') and os.path.isdir(os.path.join(checkpoint_dir, d))
]
if not checkpoint_dirs:
print(f"โŒ No checkpoint directories found in: {checkpoint_dir}")
print(f" Looking for directories named 'checkpoint-*'")
return
# Sort checkpoints by number
checkpoint_dirs.sort(key=lambda x: int(x.split('-')[1]))
print(f"\n๐Ÿ“ Found {len(checkpoint_dirs)} checkpoints:")
for ckpt in checkpoint_dirs:
print(f" - {ckpt}")
print()
# Optionally evaluate raw model once
raw_results = None
if not args.skip_raw:
print("\n" + "="*80)
print("๐Ÿค– EVALUATING RAW MODEL (once)")
print("="*80)
raw_model, raw_tokenizer = load_raw_model(args.cuda_device)
raw_results = evaluate_on_neulr_deductive(raw_model, raw_tokenizer, args.max_samples, "Raw Model", args.batch_size)
del raw_model
torch.cuda.empty_cache()
print(f"\nโœ… Raw model evaluation complete")
print(f" Accuracy: {raw_results['accuracy']:.4f} ({raw_results['accuracy']*100:.2f}%)")
# Save detailed results to JSON
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
os.makedirs(OUTPUT_DIR, exist_ok=True)
summary_data = {
'evaluation_time': timestamp,
'checkpoint_directory': checkpoint_dir,
'num_checkpoints_evaluated': len(checkpoint_dirs),
'raw_model': {
'path': RAW_MODEL_PATH,
'results': raw_results if raw_results else 'not_evaluated'
},
'checkpoints': []
}
summary_file = os.path.join(OUTPUT_DIR, f"all_checkpoints_summary_{timestamp}.json")
with open(summary_file, 'w') as f:
json.dump(summary_data, f, indent=2)
# Evaluate each checkpoint
all_checkpoint_results = []
for i, ckpt_name in enumerate(checkpoint_dirs, 1):
checkpoint_path = os.path.join(checkpoint_dir, ckpt_name)
print("\n" + "="*80)
print(f"๐ŸŽฏ EVALUATING CHECKPOINT {i}/{len(checkpoint_dirs)}: {ckpt_name}")
print("="*80)
try:
# Load and evaluate checkpoint
finetuned_model, finetuned_tokenizer = load_finetuned_model(checkpoint_path, args.cuda_device)
finetuned_results = evaluate_on_neulr_deductive(
finetuned_model, finetuned_tokenizer, args.max_samples,
f"{ckpt_name}", args.batch_size
)
del finetuned_model
torch.cuda.empty_cache()
# Store results
checkpoint_info = {
'checkpoint_name': ckpt_name,
'checkpoint_path': checkpoint_path,
'results': finetuned_results
}
summary_data["checkpoints"].append({
'name': checkpoint_info['checkpoint_name'],
'path': checkpoint_info['checkpoint_path'],
'metrics': {
'accuracy': checkpoint_info['results']['accuracy'],
'extraction_rate': checkpoint_info['results']['extraction_rate']
},
'improvements_vs_raw': {
'accuracy_delta': checkpoint_info['results']['accuracy'] - raw_results['accuracy'] if raw_results else None,
'extraction_delta': checkpoint_info['results']['extraction_rate'] - raw_results['extraction_rate'] if raw_results else None
} if raw_results else None
})
with open(summary_file, 'w') as f:
json.dump(summary_data, f, indent=2)
all_checkpoint_results.append(checkpoint_info)
print(f"\nโœ… {ckpt_name} evaluation complete")
print(f" Accuracy: {finetuned_results['accuracy']:.4f} ({finetuned_results['accuracy']*100:.2f}%) - {finetuned_results['correct']}/{finetuned_results['total']} correct")
print(f" Extraction Rate: {finetuned_results['extraction_rate']:.4f} ({finetuned_results['extraction_rate']*100:.2f}%)")
# Show improvement vs raw model if available
if raw_results:
acc_improvement = finetuned_results['accuracy'] - raw_results['accuracy']
ext_improvement = finetuned_results['extraction_rate'] - raw_results['extraction_rate']
print(f" ๐Ÿ“ˆ Improvement vs Raw: Accuracy {acc_improvement:+.4f} ({acc_improvement*100:+.2f}%), Extraction {ext_improvement:+.4f} ({ext_improvement*100:+.2f}%)")
except Exception as e:
print(f"โŒ Error evaluating {ckpt_name}: {e}")
continue
# Summary file is updated incrementally after each checkpoint above
# Create summary comparison
print("\n" + "="*80)
print("๐Ÿ“Š SUMMARY: ALL CHECKPOINTS COMPARISON")
print("="*80)
if raw_results:
print(f"\n๐Ÿค– RAW MODEL:")
print(f" Accuracy: {raw_results['accuracy']:.4f} ({raw_results['accuracy']*100:.2f}%)")
print(f" Extraction Rate: {raw_results['extraction_rate']:.4f} ({raw_results['extraction_rate']*100:.2f}%)")
print(f"\n๐ŸŽฏ FINE-TUNED CHECKPOINTS:")
if raw_results:
print(f" {'Checkpoint':<20} {'Accuracy':<15} {'Extraction':<15} {'Acc ฮ”':<12} {'Ext ฮ”':<12}")
print(f" {'-'*80}")
for checkpoint_info in all_checkpoint_results:
res = checkpoint_info['results']
acc_delta = res['accuracy'] - raw_results['accuracy']
ext_delta = res['extraction_rate'] - raw_results['extraction_rate']
print(f" {checkpoint_info['checkpoint_name']:<20} "
f"{res['accuracy']:.4f} ({res['accuracy']*100:5.2f}%) "
f"{res['extraction_rate']:.4f} "
f"{acc_delta:+.4f} "
f"{ext_delta:+.4f}")
else:
print(f" {'Checkpoint':<20} {'Accuracy':<15} {'Extraction Rate':<15}")
print(f" {'-'*60}")
for checkpoint_info in all_checkpoint_results:
res = checkpoint_info['results']
print(f" {checkpoint_info['checkpoint_name']:<20} "
f"{res['accuracy']:.4f} ({res['accuracy']*100:5.2f}%) "
f"{res['extraction_rate']:.4f} ({res['extraction_rate']*100:5.2f}%)")
# Find best checkpoint
if all_checkpoint_results:
best_ckpt = max(all_checkpoint_results, key=lambda x: x['results']['accuracy'])
print(f"\n๐Ÿ† BEST CHECKPOINT: {best_ckpt['checkpoint_name']}")
print(f" Accuracy: {best_ckpt['results']['accuracy']:.4f} ({best_ckpt['results']['accuracy']*100:.2f}%)")
print(f" Extraction Rate: {best_ckpt['results']['extraction_rate']:.4f} ({best_ckpt['results']['extraction_rate']*100:.2f}%)")
if raw_results:
best_acc_imp = best_ckpt['results']['accuracy'] - raw_results['accuracy']
best_rel_imp = (best_acc_imp / raw_results['accuracy'] * 100) if raw_results['accuracy'] > 0 else 0
print(f" ๐Ÿ“ˆ Improvement vs Raw: Accuracy {best_acc_imp:+.4f} ({best_acc_imp*100:+.2f}%), Relative {best_rel_imp:+.2f}%")
print(f"\n๐Ÿ’พ All results saved to: {summary_file}")
print("="*80 + "\n")
def print_comparison(summary):
"""Print formatted comparison results."""
print("\n" + "="*80)
print("๐Ÿ“Š neulr_deductive EVALUATION: RAW vs FINE-TUNED MODEL")
print("="*80)
raw_metrics = summary['raw_model']['metrics']
ft_metrics = summary['finetuned_model']['metrics']
print("\n๐Ÿค– RAW MODEL:")
print(f" Accuracy: {raw_metrics['accuracy']:.4f} ({raw_metrics['accuracy']*100:.2f}%) - {summary['raw_model']['correct']}/{summary['raw_model']['total']} correct")
print(f" Extraction Rate: {raw_metrics['extraction_rate']:.4f} ({raw_metrics['extraction_rate']*100:.2f}%)")
print("\n๐ŸŽฏ FINE-TUNED MODEL:")
print(f" Checkpoint: {os.path.basename(summary['finetuned_model']['checkpoint'])}")
val_score = summary['finetuned_model']['validation_score']
val_score_str = f"{val_score:.4f}" if isinstance(val_score, (int, float)) else str(val_score)
print(f" Validation Score: {val_score_str}")
print(f" Accuracy: {ft_metrics['accuracy']:.4f} ({ft_metrics['accuracy']*100:.2f}%) - {summary['finetuned_model']['correct']}/{summary['finetuned_model']['total']} correct")
print(f" Extraction Rate: {ft_metrics['extraction_rate']:.4f} ({ft_metrics['extraction_rate']*100:.2f}%)")
print("\n๐Ÿ“ˆ IMPROVEMENTS:")
comp = summary['comparison']
acc_imp = comp['accuracy_improvement']
acc_rel = comp['accuracy_relative_improvement_percent']
ext_imp = comp['extraction_improvement']
print(f" Accuracy: {acc_imp:+.4f} ({acc_imp*100:+.2f}%) | Relative: {acc_rel:+.2f}%")
print(f" Extraction: {ext_imp:+.4f} ({ext_imp*100:+.2f}%)")
print("\n" + "-"*80)
if comp['overall_improved']:
print("✅ RESULT: Fine-tuning on your dataset IMPROVED performance on neulr_deductive!")
print(f" • Accuracy improved by {acc_rel:.2f}% (relative)")
print(" The model shows better deductive reasoning ability.")
elif acc_imp < 0:
print("⚠️ RESULT: Fine-tuning on your dataset DECREASED performance on neulr_deductive.")
print(f" • Accuracy decreased by {abs(acc_rel):.2f}% (relative)")
print(" • This suggests potential overfitting to your training data.")
else:
print("➖ RESULT: Fine-tuning had NO SIGNIFICANT IMPACT on neulr_deductive performance.")
print(" The model maintained baseline deductive reasoning ability.")
print("="*80 + "\n")
def main():
global RAW_MODEL_PATH, OUTPUT_DIR
parser = argparse.ArgumentParser(description='Evaluate raw vs fine-tuned model on neulr_deductive dataset')
parser.add_argument('--max_samples', type=int, default=None,
help='Maximum number of samples to evaluate (default: the full dataset)')
parser.add_argument('--cuda_device', type=str, default='0',
help='CUDA device to use (default: 0)')
parser.add_argument('--batch_size', type=int, default=1,
help='Batch size for evaluation. Higher values (4-8) are faster but use more GPU memory (default: 1)')
parser.add_argument('--split', type=str, default='train', choices=['train', 'test', 'validation'],
help='Dataset split label (default: train). Note: the local NeuLR JSON always loads as a single "train" split, so this mainly tags cached results.')
parser.add_argument('--skip_raw', action='store_true',
help='Skip raw model evaluation (evaluate only fine-tuned model)')
parser.add_argument('--skip_finetuned', action='store_true',
help='Skip fine-tuned model evaluation (evaluate only raw model)')
parser.add_argument('--checkpoint_path', type=str, default=None,
help='Path to specific checkpoint to evaluate (e.g., /path/to/checkpoint-640). '
'If not provided, automatically selects the best checkpoint based on validation metrics.')
parser.add_argument('--checkpoint_dir', type=str, default=None,
help='Path to directory containing multiple checkpoints (e.g., /path/to/checkpoint/). '
'Will evaluate ALL checkpoint-* directories found. Cannot be used with --checkpoint_path.')
parser.add_argument('--evaluate_checkpoints', type=int, default=0,
help='If set to 1, run per-checkpoint mode: '
'evaluate the given --checkpoint_path vs cached raw results and '
'save all_cases/disagreement_cases under OUTPUT_DIR/<run>/<checkpoint_name>/<dataset_name>.')
parser.add_argument('--run', type=str, default="run",
help='Which training run to use for the output directory.')
parser.add_argument('--raw_path', type=str, default=None,
help='The raw model path')
parser.add_argument('--output_path', type=str, default=OUTPUT_DIR,
help='Model output path, defaults to env variable.')
args = parser.parse_args()
OUTPUT_DIR = args.output_path
# Validate arguments
if args.checkpoint_path and args.checkpoint_dir:
print("โŒ Error: Cannot use both --checkpoint_path and --checkpoint_dir")
print(" Use --checkpoint_path for a single checkpoint")
print(" Use --checkpoint_dir to evaluate all checkpoints in a directory")
return
if args.evaluate_checkpoints == 1 and args.checkpoint_dir:
print("โŒ Error: --evaluate_checkpoints 1 is only supported with --checkpoint_path (single checkpoint).")
print(" Please pass a single --checkpoint_path, or omit --evaluate_checkpoints to use --checkpoint_dir.")
return
# Set CUDA device
os.environ['CUDA_VISIBLE_DEVICES'] = args.cuda_device
if args.raw_path:
RAW_MODEL_PATH = args.raw_path
# Special mode: per-checkpoint evaluation with cached raw results
if args.evaluate_checkpoints == 1:
if not args.checkpoint_path:
print("โŒ Error: --evaluate_checkpoints 1 requires --checkpoint_path to be set.")
return
print("="*80)
print("๐Ÿš€ neulr_deductive PER-CHECKPOINT EVALUATION MODE")
print("="*80)
print(f"Raw Model: {RAW_MODEL_PATH}")
print(f"Output Dir: {OUTPUT_DIR}")
print(f"CUDA Device: {args.cuda_device}")
print(f"Split: {args.split}")
if args.max_samples:
print(f"Max Samples: {args.max_samples}")
print(f"Checkpoint: {args.checkpoint_path}")
print("="*80)
evaluate_checkpoint_cases(args, args.checkpoint_path)
print(f"\nโœ… Per-checkpoint evaluation finished for: {args.checkpoint_path}")
print(f" Results root directory: {OUTPUT_DIR}")
return
# If checkpoint_dir is provided, evaluate all checkpoints
if args.checkpoint_dir:
evaluate_all_checkpoints(args)
return
print("="*70)
print("๐Ÿš€ neulr_deductive EVALUATION: RAW vs FINE-TUNED")
print("="*70)
print(f"Raw Model: {RAW_MODEL_PATH}")
print(f"Training Dir: {TRAINING_DIR}")
print(f"CUDA Device: {args.cuda_device}")
print(f"Batch Size: {args.batch_size}")
if args.max_samples:
print(f"Max Samples: {args.max_samples}")
if args.skip_raw:
print(f"Mode: Fine-tuned model only")
elif args.skip_finetuned:
print(f"Mode: Raw model only")
else:
print(f"Mode: Both models (comparison)")
print("="*70)
# Determine which checkpoint to use
if not args.skip_finetuned:
if args.checkpoint_path:
# Use user-provided checkpoint
checkpoint_path = args.checkpoint_path
# Debug: show what we received
print(f"\n๐Ÿ“ Checkpoint path argument received: {checkpoint_path}")
# Handle relative vs absolute paths
if not os.path.isabs(checkpoint_path):
checkpoint_path = os.path.abspath(checkpoint_path)
print(f" Converted to absolute path: {checkpoint_path}")
if not os.path.exists(checkpoint_path):
print(f"โŒ Error: Checkpoint path does not exist: {checkpoint_path}")
print(f" Please check the path and try again.")
return
print(f"โœ… Using user-specified checkpoint: {os.path.basename(checkpoint_path)}")
best_checkpoint_info = {
'path': checkpoint_path,
'score': 'N/A (manually specified)'
}
else:
# Auto-select best checkpoint
print("\n๐Ÿ“ No checkpoint path provided, auto-selecting best checkpoint...")
best_checkpoint_path, best_score = find_best_checkpoint(TRAINING_DIR)
if best_checkpoint_path is None:
print("โŒ No valid checkpoint found!")
return
best_checkpoint_info = {
'path': best_checkpoint_path,
'score': best_score
}
else:
best_checkpoint_info = None
# Evaluate raw model
if not args.skip_raw:
raw_model, raw_tokenizer = load_raw_model(args.cuda_device)
raw_results = evaluate_on_neulr_deductive(raw_model, raw_tokenizer, args.max_samples, "Raw Model", args.batch_size)
del raw_model # Free memory
torch.cuda.empty_cache()
else:
raw_results = None
print("\nโญ๏ธ Skipping raw model evaluation")
# Evaluate fine-tuned model
if not args.skip_finetuned:
finetuned_model, finetuned_tokenizer = load_finetuned_model(best_checkpoint_info['path'], args.cuda_device)
finetuned_results = evaluate_on_neulr_deductive(finetuned_model, finetuned_tokenizer, args.max_samples, "Fine-tuned Model", args.batch_size)
del finetuned_model # Free memory
torch.cuda.empty_cache()
else:
finetuned_results = None
print("\nโญ๏ธ Skipping fine-tuned model evaluation")
# Save and display results
if raw_results and finetuned_results:
summary = save_results(raw_results, finetuned_results, best_checkpoint_info, OUTPUT_DIR)
print_comparison(summary)
elif raw_results:
print("\nโœ… Raw model evaluation completed")
elif finetuned_results:
print("\nโœ… Fine-tuned model evaluation completed")
print(f"\nโœ… All results saved to: {OUTPUT_DIR}")
if __name__ == '__main__':
main()