# MAP_EXP_10.py
# All imports at the top
import torch
import shutil
import numpy as np
import pandas as pd
import mlflow
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from transformers import (
AutoTokenizer,
TrainingArguments,
Trainer,
DataCollatorWithPadding,
BitsAndBytesConfig,
AutoModelForSequenceClassification
)
from peft import (
LoraConfig,
TaskType,
get_peft_model,
prepare_model_for_kbit_training,
)
# Step 1: Configuration
model_name = "Qwen/Qwen2.5-Math-1.5B"
MAX_LEN = 512  # maximum tokenized sequence length (used in tokenize_func below)
# MLflow setup
mlflow.set_tracking_uri("http://127.0.0.1:8081")
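# Assumes an MLflow server is already running at this URI, e.g. started with:
#   mlflow server --host 127.0.0.1 --port 8081
# Optionally group runs under a named experiment (name is illustrative):
mlflow.set_experiment("MAP_EXP_10")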
# Step 2: Loading the dataset
le = LabelEncoder()
train = pd.read_csv('train.csv')
train.Misconception = train.Misconception.fillna('NA')
train['target'] = train.Category + ":" + train.Misconception
train['label'] = le.fit_transform(train['target'])
n_classes = len(le.classes_)
print(f"Train shape: {train.shape} with {n_classes} target classes")
print(train.head())
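# Note: the encoding is invertible, e.g. le.inverse_transform([train.label.iloc[0]])
# recovers the "Category:Misconception" string in train.target.iloc[0]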
# Process correct answers
idx = train.Category.str.split('_').str[0] == 'True'
correct = train.loc[idx].copy()
correct['c'] = correct.groupby(['QuestionId', 'MC_Answer']).MC_Answer.transform('count')
correct = correct.sort_values('c', ascending=False)
correct = correct.drop_duplicates(['QuestionId'])
correct = correct[['QuestionId', 'MC_Answer']]
correct['is_correct'] = 1
train = train.merge(correct, on=['QuestionId', 'MC_Answer'], how='left')
train.is_correct = train.is_correct.fillna(0)
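# Quick sanity check: share of rows whose answer matches the one marked correct
print(f"Fraction of correct-answer rows: {train.is_correct.mean():.3f}")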
# Format input text
def format_input(row):
x = "This answer is correct."
if not row['is_correct']:
x = "This is answer is incorrect."
return (
f"Question: {row['QuestionText']}\n"
f"Answer: {row['MC_Answer']}\n"
f"{x}\n"
f"Student Explanation: {row['StudentExplanation']}"
)
train['text'] = train.apply(format_input, axis=1)
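# Peek at one formatted example to confirm the template renders as intended
print(train.text.iloc[0])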
# Split data
train_df, val_df = train_test_split(train, test_size=0.2, random_state=42)
COLS = ['text', 'label']
train_ds = Dataset.from_pandas(train_df[COLS])
val_ds = Dataset.from_pandas(val_df[COLS])
# Initialize tokenizer; fall back to the EOS token for padding if none is set
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Tokenization function
def tokenize_func(example):
return tokenizer(
example["text"],
add_special_tokens=True,
truncation=True,
        max_length=MAX_LEN,
)
# Tokenize datasets
train_ds = train_ds.map(tokenize_func, batched=True, desc="Tokenizing train data")
eval_ds = val_ds.map(tokenize_func, batched=True, desc="Tokenizing eval data")
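# Optional: inspect token lengths to confirm MAX_LEN truncates few examples
lens = [len(ids) for ids in train_ds["input_ids"]]
print(f"Token lengths: mean={np.mean(lens):.0f}, p95={np.percentile(lens, 95):.0f}, max={max(lens)}")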
# Step 3: Load model
# Model configuration
model_kwargs = dict(
trust_remote_code=True,
torch_dtype=torch.float16
)
model_kwargs["quantization_config"] = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_use_double_quant=True,
bnb_4bit_compute_dtype="float16",
)
# Load model
print(f"Loading model : {model_name}")
model = AutoModelForSequenceClassification.from_pretrained(
model_name, use_cache=False, num_labels=n_classes, **model_kwargs
)
model.config.pad_token_id = tokenizer.pad_token_id
# LoRA configuration
lora_config = LoraConfig(
r=64,
lora_alpha=64,
target_modules="all-linear",
lora_dropout=0.05,
bias="none",
task_type=TaskType.SEQ_CLS,
modules_to_save=["score"],
)
# Prepare model for training
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
# Custom evaluation metric
def compute_multi_map(eval_pred, ks=(3, 5, 10)):
"""
Computes MAP@k and a detailed rank distribution.
This includes:
- Rank counts for rank 1, 2-3, and above 3.
- For rank groups 2-3 and above 3, it finds the top 3 most frequent
classes and calculates their average probability score.
"""
# 1. Unpack logits and labels
logits, labels = eval_pred
labels = np.array(labels)
# 2. Convert logits to probabilities
# The `probs` array has shape: (num_samples, num_classes)
probs = torch.nn.functional.softmax(torch.tensor(logits), dim=-1).numpy()
# 3. Get top-k predictions
max_k = max(ks)
top_k_preds = np.argsort(-probs, axis=1)[:, :max_k]
# 4. Create a boolean match array
match_array = (top_k_preds == labels[:, None])
# 5. Compute MAP@k for each specified k
metrics = {}
for k in ks:
match_at_k = match_array[:, :k]
ranks = np.argmax(match_at_k, axis=1) + 1
has_match_at_k = np.any(match_at_k, axis=1)
scores = has_match_at_k * (1.0 / ranks)
metrics[f"map@{k}"] = np.mean(scores)
# 6. Calculate detailed rank position breakdown
ranks_with_indices = [np.where(row)[0] for row in match_array]
correct_ranks = np.array([r[0] + 1 if len(r) > 0 else max_k + 1 for r in ranks_with_indices])
total = labels.shape[0]
metrics["rank_1"] = np.sum(correct_ranks == 1)
metrics["rank_2_to_3"] = np.sum((correct_ranks >= 2) & (correct_ranks <= 3))
metrics["rank_above_3"] = np.sum((correct_ranks > 3) & (correct_ranks <= max_k))
metrics["no_match_in_top_k"] = np.sum(correct_ranks > max_k)
metrics["total"] = total
# 7. Find top 3 classes for rank groups and their average probability
# --- For ranks 2 to 3 ---
# Create a boolean mask for samples in this rank group
rank_2_to_3_mask = (correct_ranks >= 2) & (correct_ranks <= 3)
# Get the true labels for these samples
rank_2_to_3_labels = labels[rank_2_to_3_mask]
if len(rank_2_to_3_labels) > 0:
top_classes = Counter(rank_2_to_3_labels).most_common(3)
augmented_top_classes = []
for cls, count in top_classes:
# Find samples that both belong to this class AND are in this rank group
class_in_group_mask = (labels == cls) & rank_2_to_3_mask
# Get the probabilities assigned to the correct class for these specific samples
class_probs = probs[class_in_group_mask, cls]
# Calculate the average probability and add to list
avg_prob = np.mean(class_probs)
augmented_top_classes.append((cls, count, round(float(avg_prob), 4)))
metrics["rank_2_to_3_details"] = augmented_top_classes
else:
metrics["rank_2_to_3_details"] = []
# --- For ranks above 3 (up to max_k) ---
rank_above_3_mask = (correct_ranks > 3) & (correct_ranks <= max_k)
rank_above_3_labels = labels[rank_above_3_mask]
if len(rank_above_3_labels) > 0:
top_classes = Counter(rank_above_3_labels).most_common(3)
augmented_top_classes = []
for cls, count in top_classes:
class_in_group_mask = (labels == cls) & rank_above_3_mask
class_probs = probs[class_in_group_mask, cls]
avg_prob = np.mean(class_probs)
augmented_top_classes.append((cls, count, round(float(avg_prob), 4)))
metrics["rank_above_3_details"] = augmented_top_classes
else:
metrics["rank_above_3_details"] = []
mlflow.log_metric("rank_1", metrics["rank_1"])
mlflow.log_metric("rank_2_to_3", metrics["rank_2_to_3"])
mlflow.log_metric("rank_above_3", metrics["rank_above_3"])
mlflow.log_metric("no_match_in_top_k", metrics["no_match_in_top_k"])
# mlflow.log_metric("rank_2_to_3_details", metrics["rank_2_to_3_details"])
# mlflow.log_metric("rank_above_3_details", metrics["rank_above_3_details"])
return metrics
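# Minimal self-check of the MAP@k logic on toy logits (illustrative only):
# three samples, four classes, true classes at ranks 1, 2, and 4, so
# map@3 = (1 + 1/2 + 0) / 3 = 0.5. Uncomment to run (it will log to MLflow):
# _toy_logits = np.array([[4.0, 1.0, 0.5, 0.1],
#                         [2.0, 3.0, 0.5, 0.1],
#                         [3.0, 2.0, 1.0, 0.5]])
# print(compute_multi_map((_toy_logits, [0, 0, 3])))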
# Training arguments
training_args = TrainingArguments(
output_dir="MAP_EXP_09",
eval_strategy="steps",
save_strategy="no",
logging_strategy="steps",
eval_steps=100,
logging_steps=100,
learning_rate=1e-4,
per_device_train_batch_size=16,
per_device_eval_batch_size=32,
gradient_accumulation_steps=1,
lr_scheduler_type="cosine",
warmup_ratio=0.05,
report_to="mlflow",
gradient_checkpointing=True,
group_by_length=True,
max_grad_norm=1.0,
weight_decay=0.01,
num_train_epochs=2
)
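# With per_device_train_batch_size=16 and gradient_accumulation_steps=1 the
# effective batch size is 16 per device; evaluation and logging both fire
# every 100 steps, and no intermediate checkpoints are written (save_strategy="no").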
# Custom MLflow metrics logger with step tracking
class MLflowMetricsLogger:
"""
A callable class to compute and log metrics to MLflow with step tracking.
"""
    def __init__(self, trainer: Trainer, ks=(3, 5, 10)):
"""
Initializes the metrics logger.
Args:
trainer (Trainer): The Hugging Face Trainer instance.
ks (list): A list of k values for MAP@k calculation.
"""
self.trainer = trainer
self.ks = ks
def __call__(self, eval_pred):
"""
This method is called by the Trainer during evaluation.
"""
# Get the current training step from the trainer's state
step = self.trainer.state.global_step
# 1. Unpack logits and labels
logits, labels = eval_pred
labels = np.array(labels)
# 2. Convert logits to probabilities
probs = torch.nn.functional.softmax(torch.tensor(logits), dim=-1).numpy()
# 3. Get top-k predictions
max_k = max(self.ks)
top_k_preds = np.argsort(-probs, axis=1)[:, :max_k]
# 4. Create a boolean match array
match_array = (top_k_preds == labels[:, None])
# 5. Compute MAP@k for each specified k
metrics = {}
for k in self.ks:
match_at_k = match_array[:, :k]
ranks = np.argmax(match_at_k, axis=1) + 1
has_match_at_k = np.any(match_at_k, axis=1)
scores = has_match_at_k * (1.0 / ranks)
metrics[f"map@{k}"] = np.mean(scores)
# 6. Calculate detailed rank position breakdown
ranks_with_indices = [np.where(row)[0] for row in match_array]
correct_ranks = np.array([r[0] + 1 if len(r) > 0 else max_k + 1 for r in ranks_with_indices])
total = labels.shape[0]
rank_1_count = np.sum(correct_ranks == 1)
rank_2_to_3_count = np.sum((correct_ranks >= 2) & (correct_ranks <= 3))
rank_above_3_count = np.sum((correct_ranks > 3) & (correct_ranks <= max_k))
no_match_count = np.sum(correct_ranks > max_k)
# Log metrics to MLflow WITH the step argument
mlflow.log_metric("rank_1", rank_1_count, step=step)
mlflow.log_metric("rank_2_to_3", rank_2_to_3_count, step=step)
mlflow.log_metric("rank_above_3", rank_above_3_count, step=step)
mlflow.log_metric("no_match_in_top_k", no_match_count, step=step)
# Note: The detailed lists cannot be logged as a time-series metric.
# These are better logged as artifacts (e.g., a JSON file) or a dictionary
# at the end of the run if needed.
# For example: mlflow.log_dict(details_dict, "rank_details.json")
# The Trainer still requires a dictionary of metrics to be returned.
metrics["rank_1"] = rank_1_count
metrics["rank_2_to_3"] = rank_2_to_3_count
metrics["rank_above_3"] = rank_above_3_count
metrics["no_match_in_top_k"] = no_match_count
metrics["total"] = total
return metrics
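# Sketch (illustrative, names hypothetical) of persisting the non-scalar rank
# details mentioned above as an MLflow artifact rather than a metric:
#   details = {"rank_2_to_3": rank_2_to_3_details, "rank_above_3": rank_above_3_details}
#   mlflow.log_dict(details, f"rank_details_step_{step}.json")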
# Initialize trainer; compute_metrics is attached after construction because
# MLflowMetricsLogger needs a reference to the trainer itself
trainer = Trainer(
    model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),
)
metrics_computer = MLflowMetricsLogger(trainer)
trainer.compute_metrics = metrics_computer
# Main execution
if __name__ == "__main__":
# Start training
trainer.train()
# Save the model
trainer.save_model("MAP_EXP_10")
source_file = "MAP_EXP_10.py"
destination_directory = "MAP_EXP_10"
shutil.copy(source_file, destination_directory)
print(f"File '{source_file}' copied to '{destination_directory}'")
print("Training completed and model saved!")