# test_model.py - RRN QA Model evaluation script with multi-step reasoning support
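#
# Pipeline overview (as implemented below):
#   1. Load the tokenizer and enhanced/legacy checkpoint components.
#   2. Tokenize the SQuAD validation split into overlapping features.
#   3. Run inference and collect multi-step reasoning diagnostics.
#   4. Post-process start/end logits into answer strings.
#   5. Compute SQuAD exact-match / F1, save results, and optionally plot them.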
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModel, default_data_collator
from datasets import load_dataset
from tqdm.auto import tqdm
import os
import evaluate as hf_evaluate # Import with alias to avoid naming conflict
import collections
import numpy as np
import logging
import multiprocessing # For Windows multiprocessing support
import json
import argparse
import matplotlib.pyplot as plt
from collections import defaultdict
# Import custom modules and config
import config
from model import EnhancedRRN_QA_Model # Import the enhanced model
# Make sure memory.py and modules.py are accessible
# --- Configuration ---
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def main():
# Parse command line arguments
parser = argparse.ArgumentParser(description="Test RRN QA Model")
parser.add_argument("--checkpoint", type=str, default="./rrn_qa_model_epoch_3",
help="Path to checkpoint directory (default: ./rrn_qa_model_epoch_3)")
parser.add_argument("--batch_size", type=int, default=8,
help="Evaluation batch size (default: 8)")
parser.add_argument("--fixed_steps", type=int, default=None,
help="Override to use fixed number of reasoning steps (default: None, use model's dynamic steps)")
parser.add_argument("--use_memory", action="store_true",
help="Enable active memory during evaluation")
parser.add_argument("--output_dir", type=str, default="./eval_results",
help="Directory to save evaluation results (default: ./eval_results)")
parser.add_argument("--visualize", action="store_true",
help="Generate visualizations of reasoning steps")
args = parser.parse_args()
CHECKPOINT_DIR = args.checkpoint
EVAL_BATCH_SIZE = args.batch_size
DEVICE = config.DEVICE
USE_MEMORY = args.use_memory
OUTPUT_DIR = args.output_dir
# Create output directory if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)
logger.info(f"Evaluation configuration:")
logger.info(f" Checkpoint: {CHECKPOINT_DIR}")
logger.info(f" Batch size: {EVAL_BATCH_SIZE}")
logger.info(f" Device: {DEVICE}")
logger.info(f" Use memory: {USE_MEMORY}")
logger.info(f" Output directory: {OUTPUT_DIR}")
if args.fixed_steps is not None:
logger.info(f" Using fixed {args.fixed_steps} reasoning steps (overriding model config)")
# --- 1. Load Tokenizer and Model from Checkpoint ---
logger.info(f"Loading tokenizer from {CHECKPOINT_DIR}...")
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT_DIR)
logger.info(f"Loading Enhanced RRN QA Model architecture...")
# Instantiate the enhanced model architecture
model = EnhancedRRN_QA_Model(config.BASE_MODEL_NAME)
# Check if we're loading from a checkpoint with the enhanced architecture
base_model_path = os.path.join(CHECKPOINT_DIR, "base_model")
qa_head_path = os.path.join(CHECKPOINT_DIR, "qa_head.pth")
retroactive_layer_path = os.path.join(CHECKPOINT_DIR, "retroactive_layer.pth")
gating_mechanism_path = os.path.join(CHECKPOINT_DIR, "gating_mechanism.pth")
step_controller_path = os.path.join(CHECKPOINT_DIR, "step_controller.pth")
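    # Expected checkpoint layout, as assumed by the checks and loads below:
    #   <checkpoint>/base_model/            HF-format encoder weights + config
    #   <checkpoint>/qa_head.pth            span-prediction head state_dict
    #   <checkpoint>/retroactive_layer.pth  retroactive update layer state_dict
    #   <checkpoint>/gating_mechanism.pth   enhanced checkpoints only
    #   <checkpoint>/step_controller.pth    optional, learned dynamic steps
    #   <checkpoint>/enhanced_config.json   optional reasoning-step settings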
# Check for required components
    if not os.path.exists(base_model_path):
        logger.error(f"Base model directory not found at: {base_model_path}")
        raise SystemExit(1)
    if not os.path.exists(qa_head_path):
        logger.error(f"QA head weights not found at: {qa_head_path}")
        raise SystemExit(1)
    if not os.path.exists(retroactive_layer_path):
        logger.error(f"Retroactive layer weights not found at: {retroactive_layer_path}")
        raise SystemExit(1)
# Load base model weights
logger.info(f"Loading base model weights from {base_model_path}...")
model.base_model = AutoModel.from_pretrained(base_model_path)
# Check if we're loading from an enhanced checkpoint or a legacy checkpoint
is_enhanced_checkpoint = os.path.exists(gating_mechanism_path)
if is_enhanced_checkpoint:
# Load all enhanced components
logger.info("Loading enhanced model components...")
model.qa_head.load_state_dict(torch.load(qa_head_path, map_location='cpu'))
model.retroactive_update_layer.load_state_dict(torch.load(retroactive_layer_path, map_location='cpu'))
model.gating_mechanism.load_state_dict(torch.load(gating_mechanism_path, map_location='cpu'))
# Load step controller if available (for learned dynamic steps)
if os.path.exists(step_controller_path) and hasattr(model, "step_controller"):
logger.info("Loading step controller for learned dynamic steps...")
model.step_controller.load_state_dict(torch.load(step_controller_path, map_location='cpu'))
logger.info("Enhanced model loaded successfully.")
else:
# We're loading from a legacy checkpoint - need to adapt the weights
logger.info("Loading from legacy checkpoint - adapting weights to enhanced architecture...")
# For the QA head, we need to initialize the enhanced QA head from scratch
# since the architectures are different
logger.info("Initializing enhanced QA head with random weights...")
        # The retroactive layer weights from the legacy checkpoint are not loaded either,
        # since their shapes are unlikely to match the enhanced layer
logger.warning("Note: The enhanced model uses a different architecture than the checkpoint.")
logger.warning("Some components will use random initialization.")
# Load enhanced config if available
enhanced_config_path = os.path.join(CHECKPOINT_DIR, "enhanced_config.json")
if os.path.exists(enhanced_config_path):
logger.info(f"Loading enhanced configuration from {enhanced_config_path}")
with open(enhanced_config_path, 'r') as f:
enhanced_config = json.load(f)
# Override model configuration with saved values
if "num_reasoning_steps" in enhanced_config:
model.num_reasoning_steps = enhanced_config["num_reasoning_steps"]
logger.info(f"Using {model.num_reasoning_steps} reasoning steps from config")
if "use_dynamic_steps" in enhanced_config:
model.use_dynamic_steps = enhanced_config["use_dynamic_steps"]
if model.use_dynamic_steps:
model.max_reasoning_steps = enhanced_config.get("max_reasoning_steps", config.MAX_REASONING_STEPS)
model.min_reasoning_steps = enhanced_config.get("min_reasoning_steps", config.MIN_REASONING_STEPS)
model.reasoning_step_type = enhanced_config.get("reasoning_step_type", config.REASONING_STEP_TYPE)
model.early_stop_threshold = enhanced_config.get("early_stop_threshold", config.EARLY_STOP_THRESHOLD)
logger.info(f"Using dynamic reasoning steps (type: {model.reasoning_step_type})")
logger.info(f"Min steps: {model.min_reasoning_steps}, Max steps: {model.max_reasoning_steps}")
# Override with fixed steps if specified
if args.fixed_steps is not None:
logger.info(f"Overriding with fixed {args.fixed_steps} reasoning steps")
model.use_dynamic_steps = False
model.num_reasoning_steps = args.fixed_steps
model.to(DEVICE)
model.eval() # Set model to evaluation mode
logger.info("Model loaded successfully and set to evaluation mode.")
# --- 2. Load and Preprocess Validation Dataset ---
logger.info("Loading SQuAD validation dataset...")
raw_datasets = load_dataset("squad", split="validation")
question_column_name = "question"
context_column_name = "context"
answer_column_name = "answers"
pad_on_right = tokenizer.padding_side == "right"
# Validation preprocessing: Keep example_id and offset_mapping
def prepare_validation_features(examples):
examples[question_column_name] = [q.strip() for q in examples[question_column_name]]
tokenized_examples = tokenizer(
examples[question_column_name if pad_on_right else context_column_name],
examples[context_column_name if pad_on_right else question_column_name],
truncation="only_second" if pad_on_right else "only_first",
max_length=config.MAX_SEQ_LENGTH,
stride=config.DOC_STRIDE,
return_overflowing_tokens=True,
return_offsets_mapping=True,
padding="max_length",
)
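        # With return_overflowing_tokens + stride, a long context is split into several
        # overlapping features (each window sharing DOC_STRIDE tokens with the previous one),
        # all mapped back to their source example via overflow_to_sample_mapping below.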
# Keep track of which feature belongs to which example
sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
# Add the example_id to link features to original examples
tokenized_examples["example_id"] = []
for i in range(len(tokenized_examples["input_ids"])):
sequence_ids = tokenized_examples.sequence_ids(i)
context_index = 1 if pad_on_right else 0
sample_index = sample_mapping[i]
tokenized_examples["example_id"].append(examples["id"][sample_index])
# Set offset mapping to None for question tokens to avoid predicting answers there
tokenized_examples["offset_mapping"][i] = [
(o if sequence_ids[k] == context_index else None)
for k, o in enumerate(tokenized_examples["offset_mapping"][i])
]
return tokenized_examples
logger.info("Preprocessing validation dataset...")
    # Run the map in a single process: multiprocessing in datasets.map can hang on some systems (notably Windows)
    logger.info("Using a single process for preprocessing to avoid hangs")
eval_dataset = raw_datasets.map(
prepare_validation_features,
batched=True,
remove_columns=raw_datasets.column_names,
num_proc=1, # Disable multiprocessing to avoid hanging
)
# Custom collator to handle None values in offset_mapping
def custom_data_collator(features):
# First, remove offset_mapping which contains None values that can't be batched
offset_mappings = [f.pop("offset_mapping") for f in features]
# Use default collator for everything else
batch = default_data_collator(features)
# Add offset_mapping back as a list since it can't be converted to a tensor
batch["offset_mapping"] = offset_mappings
return batch
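    # Note: default_data_collator silently skips string-valued fields such as example_id,
    # so they never appear in the batch; post-processing reads example_id from eval_dataset instead.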
# Use custom data collator
data_collator = custom_data_collator
eval_dataloader = DataLoader(
eval_dataset,
collate_fn=data_collator,
batch_size=EVAL_BATCH_SIZE
)
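    # DataLoader shuffling defaults to False, so batches arrive in eval_dataset row order;
    # the positional feature_indices bookkeeping in the loop below relies on this.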
# --- 3. Run Inference ---
logger.info("***** Running Evaluation *****")
logger.info(f" Num examples = {len(eval_dataset)}")
logger.info(f" Batch size = {EVAL_BATCH_SIZE}")
all_start_logits = []
all_end_logits = []
feature_indices = [] # Keep track of the order
# Track multi-step reasoning metrics
reasoning_steps_taken = []
delta_magnitudes = []
gate_values = []
initial_vs_final_changes = []
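    # These diagnostics rely on the (assumed) contract that EnhancedRRN_QA_Model fills a
    # `custom_outputs` dict during forward with: 'steps_taken', 'all_deltas', 'all_gates',
    # and the pre-refinement 'y0_start_logits' / 'y0_end_logits'. Missing keys are simply skipped.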
with torch.no_grad():
for step, batch in enumerate(tqdm(eval_dataloader, desc="Evaluating")):
# Move batch to device
batch_on_device = {k: v.to(DEVICE) for k, v in batch.items() if isinstance(v, torch.Tensor)}
# Store feature indices corresponding to this batch
# Assuming 'input_ids' or similar key represents features in order
current_indices = list(range(step * EVAL_BATCH_SIZE, step * EVAL_BATCH_SIZE + len(batch_on_device['input_ids'])))
feature_indices.extend(current_indices)
# Forward pass - pass only inputs needed by model.forward
outputs = model(
input_ids=batch_on_device.get("input_ids"),
attention_mask=batch_on_device.get("attention_mask"),
token_type_ids=batch_on_device.get("token_type_ids"),
use_memory=USE_MEMORY, # Use memory if enabled
return_dict=True
)
# Get the final logits (y1)
start_logits = outputs.start_logits
end_logits = outputs.end_logits
all_start_logits.append(start_logits.cpu().numpy())
all_end_logits.append(end_logits.cpu().numpy())
# Collect multi-step reasoning metrics from custom_outputs
if hasattr(model, 'custom_outputs'):
# Number of reasoning steps taken
if 'steps_taken' in model.custom_outputs:
reasoning_steps_taken.append(model.custom_outputs['steps_taken'])
# Delta magnitudes (how much the model updates at each step)
if 'all_deltas' in model.custom_outputs and len(model.custom_outputs['all_deltas']) > 0:
batch_deltas = []
for delta in model.custom_outputs['all_deltas']:
# Calculate mean delta magnitude across sequence dimension
delta_norm = delta.norm(dim=-1).mean().cpu().item()
batch_deltas.append(delta_norm)
delta_magnitudes.append(batch_deltas)
# Gate values (how selective the updates are)
if 'all_gates' in model.custom_outputs and len(model.custom_outputs['all_gates']) > 0:
batch_gates = []
for gate in model.custom_outputs['all_gates']:
# Calculate mean gate value across sequence dimension
gate_mean = gate.mean().cpu().item()
batch_gates.append(gate_mean)
gate_values.append(batch_gates)
# Compare initial vs final predictions
if 'y0_start_logits' in model.custom_outputs and 'y0_end_logits' in model.custom_outputs:
y0_start = model.custom_outputs['y0_start_logits']
y0_end = model.custom_outputs['y0_end_logits']
# Calculate how much the predictions changed
start_change = (start_logits - y0_start).abs().mean().cpu().item()
end_change = (end_logits - y0_end).abs().mean().cpu().item()
initial_vs_final_changes.append((start_change + end_change) / 2)
# Concatenate all results
all_start_logits = np.concatenate(all_start_logits, axis=0)
all_end_logits = np.concatenate(all_end_logits, axis=0)
# Ensure the number of predictions matches the number of features
if len(all_start_logits) != len(eval_dataset):
logger.warning(f"Mismatch in prediction count ({len(all_start_logits)}) and feature count ({len(eval_dataset)}). Check dataloader/inference loop.")
        # Truncate any extra predictions so downstream indexing stays aligned with the eval_dataset features
all_start_logits = all_start_logits[:len(eval_dataset)]
all_end_logits = all_end_logits[:len(eval_dataset)]
# Create dictionary mapping feature index to its logits
predictions_dict = {
feature_index: (start_logit, end_logit)
for feature_index, (start_logit, end_logit) in zip(feature_indices, zip(all_start_logits, all_end_logits))
}
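    # Dict insertion order is preserved (Python 3.7+), so iterating predictions_dict.values()
    # during post-processing yields logits in the same order as the eval_dataset features.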
# --- 4. Post-Processing ---
# (Adapted from Hugging Face run_qa.py example script)
def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size=20, max_answer_length=30, tokenizer=tokenizer):
all_start_logits, all_end_logits = zip(*raw_predictions.values())
# Build a map from example ID to list of related feature indices
example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
features_per_example = collections.defaultdict(list)
for i, feature in enumerate(features):
features_per_example[example_id_to_index[feature["example_id"]]].append(i)
# Dictionary to store predictions
predictions = collections.OrderedDict()
logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")
# Loop over all examples
for example_index, example in enumerate(tqdm(examples, desc="Post-processing")):
feature_indices = features_per_example[example_index] # Indices of features related to this example
min_null_score = None # Used to identify impossible answers
valid_answers = []
context = example["context"]
# Loop through features associated with the current example
for feature_index in feature_indices:
start_logits = all_start_logits[feature_index]
end_logits = all_end_logits[feature_index]
offset_mapping = features[feature_index]["offset_mapping"]
# Update minimum null prediction score
cls_index = features[feature_index]["input_ids"].index(tokenizer.cls_token_id)
feature_null_score = start_logits[cls_index] + end_logits[cls_index]
if min_null_score is None or min_null_score < feature_null_score:
min_null_score = feature_null_score
# Go through all possibilities for start/end positions
start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
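                # argsort is ascending, so the reversed slice [-1 : -n_best_size - 1 : -1]
                # gives the indices of the n_best_size largest logits, best first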
for start_index in start_indexes:
for end_index in end_indexes:
# Skip invalid pairs (start > end, index out of bounds, answer in question part)
if start_index >= len(offset_mapping) or end_index >= len(offset_mapping) or \
offset_mapping[start_index] is None or offset_mapping[end_index] is None or \
end_index < start_index:
continue
# Check answer length
if end_index - start_index + 1 > max_answer_length:
continue
# Extract text and score
start_char = offset_mapping[start_index][0]
end_char = offset_mapping[end_index][1]
score = start_logits[start_index] + end_logits[end_index]
valid_answers.append({
"score": score,
"text": context[start_char: end_char]
})
# Select the best answer across all features for this example
if len(valid_answers) > 0:
best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
else:
# Fallback for no valid answers found
best_answer = {"text": "", "score": min_null_score} # Assign CLS score if needed
# Assign final prediction (use empty string if null score is best)
# Simple version: always take the best scoring valid answer
# More sophisticated versions might compare best_answer["score"] vs min_null_score
predictions[example["id"]] = best_answer["text"]
return predictions
logger.info("Starting post-processing...")
final_predictions = postprocess_qa_predictions(raw_datasets, eval_dataset, predictions_dict)
# --- 5. Compute Metrics ---
logger.info("Calculating SQuAD metrics...")
metric = hf_evaluate.load("squad")
# Format predictions and references for the metric
formatted_predictions = [{"id": k, "prediction_text": v} for k, v in final_predictions.items()]
formatted_references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in raw_datasets]
results = metric.compute(predictions=formatted_predictions, references=formatted_references)
logger.info("***** Evaluation Results *****")
print(results)
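    # The "squad" metric returns {'exact_match': ..., 'f1': ...} as percentages (0-100)
    # averaged over the whole validation set.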
# --- 6. Analyze Multi-step Reasoning Metrics ---
logger.info("\n***** Multi-step Reasoning Analysis *****")
# Calculate average number of reasoning steps
if reasoning_steps_taken:
avg_steps = sum(reasoning_steps_taken) / len(reasoning_steps_taken)
logger.info(f"Average reasoning steps: {avg_steps:.2f}")
# Count frequency of each step count
step_counts = collections.Counter(reasoning_steps_taken)
logger.info(f"Step count distribution: {dict(sorted(step_counts.items()))}")
# Calculate average delta magnitudes per step
if delta_magnitudes:
# Transpose to get step-wise averages
steps_delta_magnitudes = defaultdict(list)
for batch_deltas in delta_magnitudes:
for step_idx, delta in enumerate(batch_deltas):
steps_delta_magnitudes[step_idx].append(delta)
avg_delta_by_step = {step: sum(deltas)/len(deltas) for step, deltas in steps_delta_magnitudes.items()}
logger.info(f"Average delta magnitude by step: {avg_delta_by_step}")
# Calculate average gate values per step
if gate_values:
# Transpose to get step-wise averages
steps_gate_values = defaultdict(list)
for batch_gates in gate_values:
for step_idx, gate in enumerate(batch_gates):
steps_gate_values[step_idx].append(gate)
avg_gate_by_step = {step: sum(gates)/len(gates) for step, gates in steps_gate_values.items()}
logger.info(f"Average gate value by step: {avg_gate_by_step}")
# Calculate average change from initial to final predictions
if initial_vs_final_changes:
avg_change = sum(initial_vs_final_changes) / len(initial_vs_final_changes)
logger.info(f"Average change from initial to final predictions: {avg_change:.4f}")
# --- 7. Save Results ---
results_file = os.path.join(OUTPUT_DIR, "eval_results.json")
with open(results_file, 'w') as f:
# Combine SQuAD metrics with multi-step reasoning metrics
full_results = {
"squad_metrics": results,
"multi_step_metrics": {
"avg_reasoning_steps": avg_steps if reasoning_steps_taken else None,
"step_count_distribution": dict(sorted(step_counts.items())) if reasoning_steps_taken else None,
"avg_delta_by_step": avg_delta_by_step if delta_magnitudes else None,
"avg_gate_by_step": avg_gate_by_step if gate_values else None,
"avg_prediction_change": avg_change if initial_vs_final_changes else None
}
}
json.dump(full_results, f, indent=2)
logger.info(f"Results saved to {results_file}")
# --- 8. Generate Visualizations (if requested) ---
if args.visualize and (delta_magnitudes or gate_values or reasoning_steps_taken):
logger.info("Generating visualizations...")
# Create visualization directory
viz_dir = os.path.join(OUTPUT_DIR, "visualizations")
os.makedirs(viz_dir, exist_ok=True)
# Plot step distribution
if reasoning_steps_taken:
plt.figure(figsize=(10, 6))
            plt.bar(list(step_counts.keys()), list(step_counts.values()))  # convert dict views to lists for matplotlib
plt.xlabel('Number of Reasoning Steps')
plt.ylabel('Frequency')
plt.title('Distribution of Reasoning Steps')
plt.savefig(os.path.join(viz_dir, 'step_distribution.png'))
plt.close()
# Plot delta magnitudes by step
if delta_magnitudes and steps_delta_magnitudes:
plt.figure(figsize=(10, 6))
steps = sorted(steps_delta_magnitudes.keys())
values = [avg_delta_by_step[step] for step in steps]
plt.plot(steps, values, marker='o')
plt.xlabel('Reasoning Step')
plt.ylabel('Average Delta Magnitude')
plt.title('Delta Magnitude by Reasoning Step')
plt.grid(True)
plt.savefig(os.path.join(viz_dir, 'delta_magnitudes.png'))
plt.close()
# Plot gate values by step
if gate_values and steps_gate_values:
plt.figure(figsize=(10, 6))
steps = sorted(steps_gate_values.keys())
values = [avg_gate_by_step[step] for step in steps]
plt.plot(steps, values, marker='o')
plt.xlabel('Reasoning Step')
plt.ylabel('Average Gate Value')
plt.title('Gate Value by Reasoning Step')
plt.grid(True)
plt.savefig(os.path.join(viz_dir, 'gate_values.png'))
plt.close()
logger.info(f"Visualizations saved to {viz_dir}")
if __name__ == "__main__":
# This is required for Windows to properly handle multiprocessing
multiprocessing.freeze_support()
main()
# Example usage:
# Test with default settings (epoch 3 checkpoint):
# python test_model.py
# Test with specific checkpoint:
# python test_model.py --checkpoint ./rrn_qa_model_epoch_2
# Test with fixed number of reasoning steps:
# python test_model.py --fixed_steps 3
# Test with active memory:
# python test_model.py --use_memory
# Test with visualizations:
# python test_model.py --visualize
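# Combined run (hypothetical paths), saving metrics and plots for a memory-enabled evaluation:
# python test_model.py --checkpoint ./rrn_qa_model_epoch_3 --use_memory --visualize --output_dir ./eval_results_memory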