# DID NOT USE THE NEW MSMARCO DATASET, OTHERWISE EVERYTHING IS THE SAME
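"""
Fine-tunes a SentenceTransformer with a cached multiple-negatives ranking loss variant
(spread-out / hardness weighting) on Swedish and Norwegian triplet data: translated NLI
and NQ triplets, mined hard-negative datasets, graded synthetic data, and a Wikipedia
topic-clustering set. Task-specific prompts are applied to every example, and evaluation
covers triplet accuracy, SweParaphrase (STS), SweFAQ retrieval, NanoBEIR (sv/no),
Wikipedia clustering, and a sentiment classification probe.
"""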
import logging
import os
import argparse
import config # Assuming your config file is available
import json
import torch
import pandas as pd
os.environ["HF_HOME"] = config.CACHE_DIR
from pathlib import Path
from datasets import load_dataset, load_from_disk, concatenate_datasets, Dataset, DatasetDict
from sentence_transformers.evaluation import (
SequentialEvaluator,
EmbeddingSimilarityEvaluator,
InformationRetrievalEvaluator,
TripletEvaluator,
)
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import (
SentenceTransformerTrainingArguments,
BatchSamplers,
MultiDatasetBatchSamplers
)
from sentence_transformers import SentenceTransformer, losses, models
import transformers
from src.custom_loss.CachedMultipleNegativesRankingLossWithSpreadOutHardnessWeightAndMask import CachedMultipleNegativesRankingLossWithSpreadOutHardnessWeightAndMask
from src.custom_loss.CachedMultipleNegativesRankingLossWithSpreadOutHardnessWeight import CachedMultipleNegativesRankingLossWithSpreadOutHardnessWeight
import random
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import v_measure_score, adjusted_rand_score, accuracy_score, f1_score
from sentence_transformers.evaluation import SentenceEvaluator
from sklearn.linear_model import LogisticRegression
logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
transformers.logging.set_verbosity_info()
# --- MINED DATASET CONFIGURATION ---
MINED_DATASETS_BASE_PATH = Path("/mnt/disk2/translated_datasets")
# Map dataset names to their task categories
MINED_DATASET_CONFIG = {
"gooaq_train_swedish_triplets_scored": "question answering",
"reddit_train_swedish_triplets_scored": "retrieval",
"xsum_train_swedish_triplets_scored": "clustering",
"simple-wiki_train_swedish_triplets_scored": "semantic similarity",
"s2orc_train_swedish_saved_triplets_scored": "retrieval",
"amazon-reviews_train_swedish_saved_triplets_scored": "retrieval",
"paq_train_swedish_queries_retranslated_triplets_scored": "question answering",
"stackexchange-duplicates_train_swedish_triplets_scored": "semantic similarity",
"wikipedia-sections_train_swedish_triplets_scored": "retrieval",
"msmarco_triplets_swedish_triplets_scored" : "retrieval",
}
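# The task category selects which prompt prefix apply_task_prompt() attaches to each example.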
synthetic_data_path = "/mnt/disk2/combined_synthetic_data_deduplicated_classified"
#synthetic_classification_data_path = "/mnt/disk2/classification_dataset_graded_subset.jsonl"
#combined_synthetic_data_deduplicated_classified
# Maximum number of negatives to use per example from mined datasets
# The loss function (CachedMultipleNegativesRankingLoss) can handle multiple negatives
# Format: (anchor, positive, negative_1, negative_2, ..., negative_n)
MAX_NEGATIVES_PER_EXAMPLE = 10
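# Cap on the number of training examples loaded from each mined dataset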
MAX_SAMPLES = 500_000
NANOBEIR_DATASETS = [
"NanoArguAna",
"NanoClimateFEVER",
"NanoDBPedia",
"NanoFEVER",
"NanoFiQA2018",
"NanoHotpotQA",
"NanoMSMARCO",
"NanoNFCorpus",
"NanoNQ",
"NanoQuoraRetrieval",
"NanoSCIDOCS",
"NanoSciFact",
"NanoTouche2020",
]
# Map dataset names to task types for appropriate prompting
NANOBEIR_TASK_TYPES = {
"NanoArguAna": "retrieval",
"NanoClimateFEVER": "retrieval",
"NanoDBPedia": "retrieval",
"NanoFEVER": "retrieval",
"NanoFiQA2018": "retrieval",
"NanoHotpotQA": "retrieval",
"NanoMSMARCO": "retrieval",
"NanoNFCorpus": "retrieval",
"NanoNQ": "question answering",
"NanoQuoraRetrieval": "semantic similarity",
"NanoSCIDOCS": "retrieval",
"NanoSciFact": "retrieval",
"NanoTouche2020": "retrieval",
}
class ProbeClassificationEvaluator(SentenceEvaluator):
"""
Generic evaluator that trains a Logistic Regression probe on train_set
and evaluates accuracy on test_set.
"""
def __init__(self, dataset_name, sentences_train, labels_train, sentences_test, labels_test, batch_size=32, prompt_prefix=""):
self.name = f"eval_{dataset_name}_classification"
self.sentences_train = [prompt_prefix + s for s in sentences_train]
self.labels_train = labels_train
self.sentences_test = [prompt_prefix + s for s in sentences_test]
self.labels_test = labels_test
self.batch_size = batch_size
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
# 1. Encode
emb_train = model.encode(self.sentences_train, batch_size=self.batch_size, show_progress_bar=False)
emb_test = model.encode(self.sentences_test, batch_size=self.batch_size, show_progress_bar=False)
# 2. Train Probe (Fast LR)
clf = LogisticRegression(random_state=42, solver='lbfgs', max_iter=100, n_jobs=-1)
clf.fit(emb_train, self.labels_train)
# 3. Predict
preds = clf.predict(emb_test)
acc = accuracy_score(self.labels_test, preds)
logging.info(f"{self.name}: Accuracy = {acc:.4f}")
return acc
def load_swedish_reviews_evaluator(prompt_style="standard"):
"""
Loads 'timpal0l/swedish_reviews' for binary sentiment classification.
Safe from MTEB leakage.
"""
try:
logging.info("Loading timpal0l/swedish_reviews for Sentiment Probe...")
# Load the test split (approx 10k samples, good for eval)
dataset = load_dataset("timpal0l/swedish_reviews", split="test")
        # Downsample for speed: keep only 50 samples for this quick probe
if len(dataset) > 50:
dataset = dataset.shuffle(seed=42).select(range(50))
sentences = dataset["text"]
labels = dataset["label"]
# Define Prompt
if prompt_style == "standard":
# "task: classification" is the standard prompt for this
prompt = "task: classification | query: "
else:
prompt = ""
# Split 50/50 for Train/Test Probe
# We need to train the logistic regression probe on *some* data to test the embeddings
split_idx = len(sentences) // 2
evaluator = ProbeClassificationEvaluator(
dataset_name="SwedishReviews-Sentiment",
sentences_train=sentences[:split_idx],
labels_train=labels[:split_idx],
sentences_test=sentences[split_idx:],
labels_test=labels[split_idx:],
prompt_prefix=prompt,
batch_size=32
)
return evaluator
except Exception as e:
logging.warning(f"Failed to load Swedish Reviews: {e}")
return None
class CustomJSONLClassificationEvaluator(SentenceEvaluator):
"""
Reads a JSONL file and trains a Logistic Regression probe to predict 'main_title'.
This checks if the topics are linearly separable in the embedding space.
"""
def __init__(self, file_path, min_samples_per_label=5, max_classes=20, batch_size=32):
self.name = "eval_wiki_classification"
self.batch_size = batch_size
self.prompt = "task: classification | query: "
self.sentences_train = []
self.labels_train = []
self.sentences_test = []
self.labels_test = []
logging.info(f"Loading classification data from {file_path}...")
# 1. Load Data by Class
label_map = {} # { "Afrika": ["text1", "text2"...] }
with open(file_path, 'r', encoding='utf-8') as f:
for line in f:
try:
row = json.loads(line)
text = row.get('text', '').strip()
label = row.get('main_title', '').strip()
if text and label:
if label not in label_map:
label_map[label] = []
label_map[label].append(text)
                except (json.JSONDecodeError, AttributeError):
                    # Skip malformed or non-string lines
                    continue
# 2. Filter Classes
valid_labels = [l for l, texts in label_map.items() if len(texts) >= min_samples_per_label]
valid_labels.sort() # Deterministic order
# 3. Subsample Classes (Avoid training on 5000 classes)
rng = random.Random(42)
if len(valid_labels) > max_classes:
selected_labels = rng.sample(valid_labels, max_classes)
else:
selected_labels = valid_labels
logging.info(f"Classification Probe: Using {len(selected_labels)} classes (topics).")
# 4. Create Train/Test Split (80/20 per class)
for label in selected_labels:
texts = label_map[label]
# Shuffle texts for this label
rng.shuffle(texts)
# Use max 50 samples per class to keep probe fast
texts = texts[:50]
split_idx = int(0.8 * len(texts))
# Train set
for t in texts[:split_idx]:
self.sentences_train.append(self.prompt + t)
self.labels_train.append(label)
# Test set
for t in texts[split_idx:]:
self.sentences_test.append(self.prompt + t)
self.labels_test.append(label)
logging.info(f"Probe Sizes: Train={len(self.sentences_train)}, Test={len(self.sentences_test)}")
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
if not self.sentences_train:
return 0.0
# 1. Encode
emb_train = model.encode(self.sentences_train, batch_size=self.batch_size, show_progress_bar=False)
emb_test = model.encode(self.sentences_test, batch_size=self.batch_size, show_progress_bar=False)
# 2. Train Probe (Fast Logistic Regression)
clf = LogisticRegression(random_state=42, solver='lbfgs', max_iter=100, n_jobs=-1)
clf.fit(emb_train, self.labels_train)
# 3. Predict & Score
preds = clf.predict(emb_test)
acc = accuracy_score(self.labels_test, preds)
# Macro F1 is better if classes are imbalanced
f1 = f1_score(self.labels_test, preds, average='macro')
logging.info(f"{self.name}: Accuracy={acc:.4f} | F1-Macro={f1:.4f}")
return acc
class CustomJSONLClusteringEvaluator(SentenceEvaluator):
"""
Evaluates clustering on a specific number of distinct Wikipedia topics.
"""
def __init__(self, file_path, min_samples_per_topic=5, max_clusters=50, batch_size=32):
self.name = "eval_wiki_clustering"
self.batch_size = batch_size
self.prompt = "task: clustering | query: "
self.sentences = []
self.labels = []
logging.info(f"Loading clustering data from {file_path}...")
# 1. Load ALL data into memory organized by Topic
# Structure: { "Afrika": ["text1", "text2"...], "Amager": ["text1"...] }
topic_map = {}
with open(file_path, 'r', encoding='utf-8') as f:
for line in f:
try:
row = json.loads(line)
text = row.get('text', '').strip()
# Use 'main_title' as the Topic Label
label = row.get('main_title', '').strip()
if text and label:
if label not in topic_map:
topic_map[label] = []
topic_map[label].append(text)
                except (json.JSONDecodeError, AttributeError):
                    # Skip malformed or non-string lines
                    continue
# 2. Filter topics that are too small
# We need topics with enough content to actually cluster meaningful points
valid_topics = [
topic for topic, texts in topic_map.items()
if len(texts) >= min_samples_per_topic
]
logging.info(f"Found {len(valid_topics)} valid topics with >= {min_samples_per_topic} samples.")
# 3. Sample exactly 'max_clusters' topics (e.g., 50)
# We sort first to make the random seed deterministic across runs
valid_topics.sort()
# Use a fixed seed so the "random" 50 topics are the same every time you restart training
rng = random.Random(42)
if len(valid_topics) > max_clusters:
selected_topics = rng.sample(valid_topics, max_clusters)
logging.info(f"Subsampled to {max_clusters} distinct topics for evaluation.")
else:
selected_topics = valid_topics
logging.info(f"Using all {len(selected_topics)} topics (fewer than max_clusters).")
# 4. Flatten the data for the evaluator
for topic in selected_topics:
# You can also limit samples per topic here (e.g. max 20 paragraphs per topic) to keep it balanced
texts = topic_map[topic][:20]
for text in texts:
self.sentences.append(self.prompt + text)
self.labels.append(topic)
self.n_clusters = len(selected_topics)
logging.info(f"Final Clustering Probe: {len(self.sentences)} samples across {self.n_clusters} topics.")
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
if not self.sentences:
return 0.0
# Encode
embeddings = model.encode(self.sentences, batch_size=self.batch_size, show_progress_bar=False)
# Cluster
clustering = MiniBatchKMeans(
n_clusters=self.n_clusters,
batch_size=256,
random_state=42,
n_init='auto'
)
clustering.fit(embeddings)
# Score
v_score = v_measure_score(self.labels, clustering.labels_)
ari_score = adjusted_rand_score(self.labels, clustering.labels_)
logging.info(f"{self.name}: V-Measure={v_score:.4f} | ARI={ari_score:.4f}")
return v_score
def load_nanobeir_evaluator(dataset_name, language, task_type="retrieval"):
"""
Load a NanoBEIR dataset for a specific language and create an InformationRetrievalEvaluator.
Args:
dataset_name: Name of the NanoBEIR dataset (e.g., "NanoMSMARCO")
language: Language code ("sv" for Swedish, "no" for Norwegian)
task_type: Task type for prompting ("retrieval", "question answering", "semantic similarity")
Returns:
InformationRetrievalEvaluator or None if loading fails
"""
try:
logging.info(f"Loading {dataset_name} for language: {language}")
# Load corpus (documents)
corpus_data = load_dataset(
"lightonai/nanobeir-multilingual",
f"{dataset_name}_{language}",
split="corpus"
)
# Load queries
queries_data = load_dataset(
"lightonai/nanobeir-multilingual",
f"{dataset_name}_{language}",
split="queries"
)
# Load qrels (relevance judgments) - language independent
qrels_data = load_dataset(
"lightonai/nanobeir-multilingual",
dataset_name,
split="qrels"
)
# Apply task-specific prompts
if task_type == "retrieval":
query_prompt = "task: search result | query: "
doc_prompt = "title: none | text: "
elif task_type == "question answering":
query_prompt = "task: question answering | query: "
doc_prompt = "title: none | text: "
elif task_type == "semantic similarity":
query_prompt = "task: semantic similarity | query: "
doc_prompt = "task: semantic similarity | query: "
else:
query_prompt = ""
doc_prompt = ""
# Build corpus dictionary with prompts
corpus = {}
for item in corpus_data:
doc_id = item['_id']
# Combine title and text if title exists
if 'title' in item and item['title']:
text = f"{item['title']} {item['text']}"
else:
text = item['text']
corpus[doc_id] = doc_prompt + text
# Build queries dictionary with prompts
queries = {}
for item in queries_data:
query_id = item['_id']
queries[query_id] = query_prompt + item['text']
# Build relevance dictionary
# Note: NanoBEIR qrels only have query-id and corpus-id (no score column)
# All entries in qrels are considered relevant
relevant_docs = {}
for item in qrels_data:
query_id = item['query-id']
corpus_id = item['corpus-id']
if query_id not in relevant_docs:
relevant_docs[query_id] = set()
relevant_docs[query_id].add(corpus_id)
# Filter queries to only those with relevant documents
queries = {qid: text for qid, text in queries.items() if qid in relevant_docs}
if not queries:
logging.warning(f"No queries with relevant documents found for {dataset_name}_{language}")
return None
# Create evaluator
evaluator_name = f"{dataset_name}-{language}-dev"
evaluator = InformationRetrievalEvaluator(
queries=queries,
corpus=corpus,
relevant_docs=relevant_docs,
name=evaluator_name,
            # Restrict each metric to a single cutoff to reduce clutter; NDCG@10 is the headline metric
            mrr_at_k=[10],
            ndcg_at_k=[10],
            accuracy_at_k=[1],
            precision_recall_at_k=[1],
            map_at_k=[100],
)
logging.info(f"Created {evaluator_name} with {len(queries)} queries and {len(corpus)} documents")
return evaluator
except Exception as e:
logging.error(f"Failed to load {dataset_name} for {language}: {e}")
return None
def create_nanobeir_evaluators(languages=["sv", "no"], dataset_names=None):
"""
Create InformationRetrievalEvaluators for multiple NanoBEIR datasets and languages.
Args:
languages: List of language codes (default: ["sv", "no"])
dataset_names: List of dataset names to use (default: all NANOBEIR_DATASETS)
Returns:
List of InformationRetrievalEvaluators
"""
if dataset_names is None:
dataset_names = NANOBEIR_DATASETS
evaluators = []
for language in languages:
logging.info(f"\n=== Loading NanoBEIR datasets for {language.upper()} ===")
for dataset_name in dataset_names:
task_type = NANOBEIR_TASK_TYPES.get(dataset_name, "retrieval")
evaluator = load_nanobeir_evaluator(dataset_name, language, task_type)
if evaluator is not None:
evaluators.append(evaluator)
logging.info(f"\nSuccessfully created {len(evaluators)} NanoBEIR evaluators")
return evaluators
def handle_none_negatives(example):
"""Replaces None in the 'negative' column with an empty string."""
if example["negative"] is None:
example["negative"] = ""
return example
def is_good_or_excellent(example):
"""Filter function to keep only good or excellent graded examples."""
if "dialect" in example["task_description"].lower():
return False
return example['grade'] in ['good', 'excellent']
def prepare_triplet_eval_data(dev_set):
"""
Prepares anchors, positives, and negatives for a TripletEvaluator.
Extracts raw data WITHOUT prompts - prompts will be applied separately.
Handles both 'negative' and 'negative_1' column names.
"""
anchors = dev_set["anchor"]
positives = dev_set["positive"]
# Handle both column naming conventions
if "negative" in dev_set.column_names:
negatives = dev_set["negative"]
elif "negative_1" in dev_set.column_names:
negatives = dev_set["negative_1"]
else:
raise ValueError(f"No negative column found. Available columns: {dev_set.column_names}")
return anchors, positives, negatives
def load_clustering_dataset(dataset_path, max_negatives=5):
"""
Loads the pre-saved Clustering dataset from disk.
Applies clustering prompts and handles schema consistency for DDP.
"""
if not os.path.exists(dataset_path):
logging.warning(f"Clustering data not found at: {dataset_path}")
return None
logging.info(f"Loading clustering dataset from disk: {dataset_path}")
dataset = load_from_disk(str(dataset_path))
# 1. Ensure we don't have None values (Tokenizer crash protection)
def fill_empty(example):
for key in example.keys():
if example[key] is None:
example[key] = ""
return example
dataset = dataset.map(fill_empty)
# 2. Add clustering category and apply prompts BEFORE padding
dataset = dataset.add_column("new_category", ["clustering"] * len(dataset))
dataset = dataset.map(apply_task_prompt)
dataset = dataset.remove_columns(['new_category'])
# 3. Enforce exact column schema for DDP - pad AFTER applying prompts
desired_cols = ["anchor", "positive"] + [f"negative_{i+1}" for i in range(max_negatives)]
# Pad if we have fewer negatives than the global config
for col in desired_cols:
if col not in dataset.column_names:
dataset = dataset.add_column(col, [""] * len(dataset))
# Select only the needed columns (drops extra negatives if you have > max)
dataset = dataset.select_columns(desired_cols)
logging.info(f"Loaded {len(dataset)} clustering examples with prompts applied.")
return dataset
def apply_task_prompt(example, dropout_rate=0.1):
"""
Applies strict task-specific prompts based on the provided schema.
Skips empty strings to avoid adding prompts to padding columns.
"""
task_type = example["new_category"]
if random.random() < dropout_rate:
return example
# --- 1. RETRIEVAL (Asymmetric) ---
# Anchor: "task: search result | query: {content}"
# Docs: "title: none | text: {content}"
if task_type == "retrieval":
if example['anchor']: # Only apply if not empty
example['anchor'] = "task: search result | query: " + example['anchor']
# Define the document prompt (assuming no title column available, using 'none')
doc_prompt = "title: none | text: "
if example['positive']: # Only apply if not empty
example['positive'] = doc_prompt + example['positive']
# Apply to all negatives
for key in list(example.keys()):
if key.startswith('negative') and example[key]: # This already checks for non-empty
example[key] = doc_prompt + example[key]
# --- 2. QUESTION ANSWERING (Symmetric) ---
# User Request: Same prompt for anchor and positive.
# Prompt: "task: question answering | query: {content}"
elif task_type == "question answering":
instruct = "task: question answering | query: "
if example['anchor']:
example['anchor'] = instruct + example['anchor']
if example['positive']:
example['positive'] = instruct + example['positive']
for key in list(example.keys()):
if key.startswith('negative') and example[key]:
example[key] = instruct + example[key]
# --- 3. CLUSTERING (Symmetric) ---
# Prompt: "task: clustering | query: {content}"
elif task_type == "clustering":
instruct = "task: clustering | query: "
if example['anchor']:
example['anchor'] = instruct + example['anchor']
if example['positive']:
example['positive'] = instruct + example['positive']
for key in list(example.keys()):
if key.startswith('negative') and example[key]:
example[key] = instruct + example[key]
# --- 4. CLASSIFICATION (Symmetric) ---
# Prompt: "task: classification | query: {content}"
elif task_type == "classification":
instruct = "task: classification | query: "
if example['anchor']:
example['anchor'] = instruct + example['anchor']
if example['positive']:
example['positive'] = instruct + example['positive']
for key in list(example.keys()):
if key.startswith('negative') and example[key]:
example[key] = instruct + example[key]
# --- 5. SEMANTIC SIMILARITY (Symmetric) ---
    # Prompt: "task: semantic similarity | query: {content}"
elif task_type == "semantic similarity":
instruct = "task: semantic similarity | query: "
if example['anchor']:
example['anchor'] = instruct + example['anchor']
if example['positive']:
example['positive'] = instruct + example['positive']
for key in list(example.keys()):
if key.startswith('negative') and example[key]:
example[key] = instruct + example[key]
# Fallback
else:
logging.warning(f"Unknown task category: {task_type}. No prompt applied.")
return example
def load_mined_dataset(dataset_name, task_category, max_samples=None, max_negatives=10):
"""
Load a single mined dataset and prepare it for training.
Args:
dataset_name: Name of the dataset directory
task_category: Task category for prompting
max_samples: Optional limit on number of samples to load
max_negatives: Maximum number of negatives to include per example (default: 10)
Returns:
Processed dataset with prompts applied, with multiple negatives per example
"""
dataset_path = MINED_DATASETS_BASE_PATH / dataset_name
if not dataset_path.exists():
logging.warning(f"Dataset path does not exist: {dataset_path}. Skipping.")
return None
logging.info(f"Loading mined dataset from: {dataset_path}")
dataset = load_from_disk(str(dataset_path))
if max_samples and len(dataset) > max_samples:
logging.info(f"Sampling {max_samples} from {len(dataset)} samples")
dataset = dataset.shuffle(seed=42).select(range(max_samples))
logging.info(f"Loaded {len(dataset)} samples from {dataset_name}")
# The mined datasets have structure: anchor, positive, pos_score, negatives (list), neg_scores (list)
# We need to convert to: anchor, positive, negative_1, negative_2, ..., negative_n
# BUT we only keep negatives that actually exist (no None/empty values)
def expand_negatives(example):
"""
Convert the list of negatives into separate columns.
Ensures ALL columns from negative_1 to negative_{max_negatives} exist.
        Pads missing negatives with empty strings.
"""
result = {
'anchor': example['anchor'],
'positive': example['positive'],
}
# Get the list of negatives, ensure it's a list
raw_negatives = example.get('negatives', [])
if raw_negatives is None:
raw_negatives = []
# Filter out empty strings or None values from the source list
valid_negatives = [n for n in raw_negatives if n]
# --- KEY FIX IS HERE ---
# We must iterate up to max_negatives every time to ensure the
# dictionary keys (schema) are identical for every row.
for i in range(max_negatives):
key_name = f'negative_{i+1}'
if i < len(valid_negatives):
# We have a valid negative
result[key_name] = valid_negatives[i]
else:
                # We ran out of negatives -> fill with an empty string
                # so the column exists (with a string type) in the Arrow table
result[key_name] = ''
# Check if we have at least one negative
result['_has_negatives'] = len(valid_negatives) > 0
return result
dataset = dataset.map(expand_negatives, remove_columns=dataset.column_names)
# Filter out any examples without valid negatives
dataset = dataset.filter(lambda x: x['_has_negatives'])
dataset = dataset.remove_columns(['_has_negatives'])
if len(dataset) == 0:
logging.warning(f"No valid examples with negatives found in {dataset_name}")
return None
# Add task category for prompting
dataset = dataset.add_column("new_category", [task_category] * len(dataset))
dataset = dataset.map(apply_task_prompt)
dataset = dataset.remove_columns(['new_category'])
    # Count the negative_* columns (including padded ones) for logging
    sample = dataset[0]
    num_negative_cols = sum(1 for k in sample.keys() if k.startswith('negative_'))
    logging.info(f"Prepared {len(dataset)} training examples from {dataset_name}")
    logging.info(f"Examples have between 1 and {num_negative_cols} non-empty negatives each")
desired_columns = ["anchor", "positive"]
# scan for all negative columns that actually exist in the dataset
existing_columns = dataset.column_names
negative_cols = [c for c in existing_columns if c.startswith("negative_")]
# Sort them to ensure negative_1 comes before negative_2, etc.
# We sort by the integer number in the column name
negative_cols.sort(key=lambda x: int(x.split('_')[1]))
desired_columns.extend(negative_cols)
# 2. Force the dataset to use this specific order
dataset = dataset.select_columns(desired_columns)
logging.info(f"Enforced column order: {dataset.column_names}")
return dataset
def pad_dataset_schema(dataset, total_negatives=5):
"""
Ensures the dataset has columns negative_1 to negative_{total_negatives}.
Fills missing columns with empty strings.
"""
new_columns = {}
existing_cols = dataset.column_names
for i in range(total_negatives):
col_name = f"negative_{i+1}"
if col_name not in existing_cols:
# Create a column of empty strings efficiently
new_columns[col_name] = [""] * len(dataset)
    if new_columns:
        # Add the missing negative_* columns one by one
        for col_name, data in new_columns.items():
            dataset = dataset.add_column(col_name, data)
return dataset
def main(args):
run_type = "probe" if args.probe_run else "full"
base_model_name = Path(args.fine_tune_model_path).name
data_name = "ms_marco_nli_mined"
run_name = f"finetune-CachedMNRL-{base_model_name}-on-{data_name}-{run_type}"
output_dir = os.path.join(config.OUTPUT_DIR, run_name)
logging.info(f"--- Starting {run_type.upper()} Run: {run_name} ---")
logging.info(f"Fine-tuning model from: {args.fine_tune_model_path}")
model = SentenceTransformer(args.fine_tune_model_path)
logging.info(model)
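    # Some tokenizers return token_type_ids that the wrapped transformer's forward() does not
    # accept (assumed to be the case for this checkpoint), which would otherwise raise a TypeError.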
logging.info("Patching the model's `tokenize` method to remove token_type_ids...")
original_tokenize = model.tokenize
def patched_tokenize(*args, **kwargs):
# Call the original tokenizer to get the encoded inputs
tokenized_output = original_tokenize(*args, **kwargs)
# Remove the unwanted key from the output
if "token_type_ids" in tokenized_output:
del tokenized_output["token_type_ids"]
return tokenized_output
# Replace the original method with our patched version
model.tokenize = patched_tokenize
logging.info("Model's `tokenize` method patched successfully.")
logging.info(f"Setting model max length to {config.FT_MODEL_MAX_LENGTH}")
model.max_seq_length = config.FT_MODEL_MAX_LENGTH
# === LOAD ORIGINAL DATASETS ===
nli_data_path = "/mnt/disk2/snli_triplets_swedish" # sts
msmarco_data_path = "/mnt/disk2/msmarco_triplets_swedish" # retrieval
nq_data_path = "/mnt/disk2/nq_triplets_swedish" # retrieval
# Load NLI data
logging.info(f"Loading translated NLI triplets from: {nli_data_path}")
nli_dataset = load_from_disk(nli_data_path)
nli_dataset = nli_dataset.add_column("new_category", ["semantic similarity"] * len(nli_dataset))
nli_dataset = nli_dataset.map(apply_task_prompt)
nli_dataset = nli_dataset.rename_column("negative", "negative_1")
    nli_dataset = pad_dataset_schema(nli_dataset, MAX_NEGATIVES_PER_EXAMPLE)  # pad to the shared negative_1..N schema
nli_dataset = nli_dataset.select_columns(["anchor", "positive"] + [f"negative_{i+1}" for i in range(MAX_NEGATIVES_PER_EXAMPLE)])
logging.info(f"Loaded {len(nli_dataset)} NLI triplets.")
# Load MS MARCO data
# logging.info(f"Loading translated MSMARCO triplets from: {msmarco_data_path}")
# msmarco_dataset = load_from_disk(msmarco_data_path)
# msmarco_dataset = msmarco_dataset.rename_column("query", "anchor")
# msmarco_dataset = msmarco_dataset.add_column("new_category", ["retrieval"] * len(msmarco_dataset))
# msmarco_dataset = msmarco_dataset.map(apply_task_prompt)
# msmarco_dataset = msmarco_dataset.rename_column("negative", "negative_1")
# msmarco_dataset = pad_dataset_schema(msmarco_dataset, MAX_NEGATIVES_PER_EXAMPLE) # <--- ADD THIS
# msmarco_dataset = msmarco_dataset.select_columns(["anchor", "positive"] + [f"negative_{i+1}" for i in range(MAX_NEGATIVES_PER_EXAMPLE)])
# logging.info(f"Loaded {len(msmarco_dataset)} MSMARCO triplets.")
# Load NQ data
logging.info(f"Loading translated NQ triplets from: {nq_data_path}")
nq_dataset = load_from_disk(nq_data_path)
nq_dataset = nq_dataset.rename_column("query", "anchor")
nq_dataset = nq_dataset.add_column("new_category", ["question answering"] * len(nq_dataset))
nq_dataset = nq_dataset.map(apply_task_prompt)
nq_dataset = nq_dataset.rename_column("negative", "negative_1")
    nq_dataset = pad_dataset_schema(nq_dataset, MAX_NEGATIVES_PER_EXAMPLE)  # pad to the shared negative_1..N schema
nq_dataset = nq_dataset.select_columns(["anchor", "positive"] + [f"negative_{i+1}" for i in range(MAX_NEGATIVES_PER_EXAMPLE)])
logging.info(f"Loaded {len(nq_dataset)} NQ triplets.")
logging.info(f"Loading synthetic data from: {synthetic_data_path}")
synthetic_dataset = load_from_disk(synthetic_data_path)
synthetic_dataset = synthetic_dataset.filter(is_good_or_excellent)
synthetic_dataset = synthetic_dataset.map(handle_none_negatives)
synthetic_dataset = synthetic_dataset.rename_column("query", "anchor")
synthetic_dataset = synthetic_dataset.map(apply_task_prompt)
synthetic_dataset = synthetic_dataset.rename_column("negative", "negative_1")
synthetic_dataset = pad_dataset_schema(synthetic_dataset, MAX_NEGATIVES_PER_EXAMPLE)
synthetic_dataset = synthetic_dataset.select_columns(["anchor", "positive"] + [f"negative_{i+1}" for i in range(MAX_NEGATIVES_PER_EXAMPLE)])
logging.info(f"Loaded {len(synthetic_dataset)} synthetic triplets.")
# Load wiki clustering data
cluster_data_path = "/home/ubuntu/work/WSCLToolkit/sent_emb_train/non_vital_code/final_clustering_data.jsonl/"
cluster_dataset = load_clustering_dataset(
cluster_data_path,
max_negatives=MAX_NEGATIVES_PER_EXAMPLE
)
# === LOAD MINED DATASETS ===
logging.info("\n=== Loading Mined Hard Negative Datasets ===")
logging.info(f"Using up to {MAX_NEGATIVES_PER_EXAMPLE} negatives per example")
mined_datasets = {}
for dataset_name, task_category in MINED_DATASET_CONFIG.items():
dataset = load_mined_dataset(
dataset_name=dataset_name,
task_category=task_category,
            max_samples=MAX_SAMPLES,  # Cap samples per mined dataset
max_negatives=MAX_NEGATIVES_PER_EXAMPLE
)
if dataset is not None:
mined_datasets[dataset_name] = dataset
logging.info(f"Successfully loaded {len(mined_datasets)} mined datasets")
# === SPLIT DATASETS FOR EVALUATION ===
logging.info("\n=== Splitting datasets to create dev sets for TripletEvaluators ===")
eval_samples_per_dataset = 1000
# Split NLI
nli_split = nli_dataset.train_test_split(test_size=eval_samples_per_dataset, seed=42)
nli_train = nli_split["train"]
nli_dev = nli_split["test"]
# Split MS MARCO
# msmarco_split = msmarco_dataset.train_test_split(
# test_size=eval_samples_per_dataset, seed=42
# )
# msmarco_train = msmarco_split["train"]
# msmarco_dev = msmarco_split["test"]
# Split NQ
nq_split = nq_dataset.train_test_split(test_size=eval_samples_per_dataset, seed=42)
nq_train = nq_split["train"]
nq_dev = nq_split["test"]
# === COMBINE ALL DATASETS ===
# Use DatasetDict for stratified sampling across datasets
datasets = {}
# Add original datasets
datasets["NLI"] = nli_train
#datasets["MSMARCO"] = msmarco_train
datasets["NQ"] = nq_train
datasets["Topic_Clustering"] = cluster_dataset
datasets["Synthetic"] = synthetic_dataset
# datasets["Synthetic_Classification"] = synthetic_classification_dataset
# Add all mined datasets
datasets.update(mined_datasets)
# Convert to DatasetDict for stratified batch sampling
train_dataset = DatasetDict(datasets)
logging.info(f"Created training DatasetDict with {len(train_dataset)} datasets")
# === DATASET COMPOSITION TABLE ===
# For DatasetDict, iterate over the dataset dict
total_samples = sum(len(ds) for ds in train_dataset.values())
composition_data = []
for name, ds in train_dataset.items():
num_samples = len(ds)
percentage = (num_samples / total_samples) * 100 if total_samples > 0 else 0
composition_data.append({
"Dataset": name,
"Number of Examples": num_samples,
"Percentage (%)": f"{percentage:.2f}%"
})
print("\n📊 Finetuning Dataset Composition")
print("-" * 80)
print(f"{'Dataset':<50} | {'Number of Examples':<20} | {'Percentage (%)'}")
print("-" * 80)
for item in composition_data:
print(f"{item['Dataset']:<50} | {item['Number of Examples']:<20} | {item['Percentage (%)']}")
print("-" * 80)
print(f"{'Total':<50} | {total_samples:<20} | {'100.00%'}")
print("-" * 80)
logging.info(f"Training with DatasetDict containing {len(train_dataset)} datasets and {total_samples} total samples.")
# === CONFIGURE LOSS ===
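    # CachedMNRL-style losses process the large per-device batch in mini_batch_size chunks with
    # gradient caching, so in-batch negatives span the whole conceptual batch at bounded memory.
    # scale is the usual MNRL similarity scale; spread_out_loss_weight / hardness_alpha /
    # mask_duplicate_positives are parameters of this custom variant (semantics assumed from their names).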
loss = CachedMultipleNegativesRankingLossWithSpreadOutHardnessWeightAndMask(
model=model,
mini_batch_size=config.FT_BS,
spread_out_loss_weight=0.1,
use_hardness_weighting=True,
mask_duplicate_positives=False,
hardness_alpha=3.0,
scale=50,
)
    run_name = run_name + f"-added-datasets-max_{MAX_SAMPLES}_per_dataset"
# NLI Evaluator (semantic similarity) - prompts already applied
nli_anchors, nli_pos, nli_neg = prepare_triplet_eval_data(nli_dev)
nli_triplet_evaluator = TripletEvaluator(nli_anchors, nli_pos, nli_neg, name="nli-triplet-dev")
# MS MARCO Evaluator (retrieval - asymmetric) - prompts already applied
# msmarco_anchors, msmarco_pos, msmarco_neg = prepare_triplet_eval_data(msmarco_dev)
# msmarco_triplet_evaluator = TripletEvaluator(msmarco_anchors, msmarco_pos, msmarco_neg, name="msmarco-triplet-dev")
# NQ Evaluator (question answering) - prompts already applied
nq_anchors, nq_pos, nq_neg = prepare_triplet_eval_data(nq_dev)
nq_triplet_evaluator = TripletEvaluator(nq_anchors, nq_pos, nq_neg, name="nq-triplet-dev")
wiki_cluster_eval = CustomJSONLClusteringEvaluator(
file_path="/home/ubuntu/work/WSCLToolkit/sent_emb_train/non_vital_code/parsed_wiki_sections.jsonl",
min_samples_per_topic=5,
        max_clusters=500  # Subsample up to 500 random topics (e.g. Afrika, Amager, Kaffe, Volvo...)
)
sentiment_eval = load_swedish_reviews_evaluator(prompt_style="standard")
# Paraphrase (STS) Evaluator
print("\n--- Setting up Paraphrase Evaluator from JSONL ---")
para_sents1 = []
para_sents2 = []
para_scores = []
max_score = 0.0
with open(config.SWE_PARA_PATH, "r", encoding="utf-8") as f:
for line in f:
data = json.loads(line)
para_sents1.append(data["sentence_1"])
para_sents2.append(data["sentence_2"])
score = float(data["label"])
para_scores.append(score)
if score > max_score:
max_score = score
if max_score > 0:
normalized_scores = [score / max_score for score in para_scores]
else:
normalized_scores = para_scores
# Apply prompts to paraphrase data
para_sents1 = ["task: semantic similarity | query: " + s for s in para_sents1]
para_sents2 = ["task: semantic similarity | query: " + s for s in para_sents2]
sweparaphrase_evaluator = EmbeddingSimilarityEvaluator(
sentences1=para_sents1,
sentences2=para_sents2,
scores=normalized_scores,
name="sweparaphrase-dev",
)
# FAQ Retrieval Evaluator
print("\n--- Setting up FAQ Retrieval Evaluator from JSONL ---")
with open(config.SWE_FAQ_PATH, "r", encoding="utf-8") as f:
faq_data = [json.loads(line) for line in f]
unique_answers = {answer for item in faq_data for answer in item["candidate_answers"]}
answer_to_cid = {answer: f"doc_{i}" for i, answer in enumerate(unique_answers)}
# Apply prompts to FAQ corpus (documents)
faq_corpus = {
cid: "title: none | text: " + answer
for answer, cid in answer_to_cid.items()
}
faq_queries = {}
faq_relevant_docs = {}
for i, item in enumerate(faq_data):
query_id = f"q_{i}"
faq_queries[query_id] = "task: search result | query: " + item['question']
correct_answer = item["candidate_answers"][item["label"]]
correct_cid = answer_to_cid[correct_answer]
faq_relevant_docs[query_id] = {correct_cid}
swefaq_evaluator = InformationRetrievalEvaluator(
queries=faq_queries,
corpus=faq_corpus,
relevant_docs=faq_relevant_docs,
name="swefaq-dev",
)
nanobeir_evaluators = create_nanobeir_evaluators(
languages=["sv", "no"],
dataset_names=[
"NanoMSMARCO",
"NanoNQ",
"NanoQuoraRetrieval",
"NanoFEVER",
"NanoHotpotQA"
]
)
    # Drop any evaluators whose loaders failed and returned None
    core_evaluators = [
        nli_triplet_evaluator,
        # msmarco_triplet_evaluator,
        nq_triplet_evaluator,
        sweparaphrase_evaluator,
        swefaq_evaluator,
        wiki_cluster_eval,
        sentiment_eval,
    ]
    main_evaluator = SequentialEvaluator(
        evaluators=[e for e in core_evaluators if e is not None] + nanobeir_evaluators,
    )
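    # SequentialEvaluator runs each evaluator in turn; model selection relies on the explicit
    # metric_for_best_model set in the training arguments below, not on an aggregate score.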
print("\nAll evaluators combined and ready.")
# === CONFIGURE TRAINING ARGUMENTS ===
training_args = SentenceTransformerTrainingArguments(
output_dir=config.FT_OUTPUT_DIR,
num_train_epochs=1,
per_device_eval_batch_size=64,
per_device_train_batch_size=config.FT_CONCEPTUAL_BS,
learning_rate=config.FT_LR,
bf16=True,
report_to="wandb",
run_name=run_name,
save_total_limit=2,
logging_steps=config.LOGGING_STEPS,
eval_strategy=config.EVAL_STRATEGY,
save_strategy=config.EVAL_STRATEGY,
eval_steps=config.FT_EVAL_STEPS,
save_steps=config.FT_SAVE_STEPS,
load_best_model_at_end=True,
warmup_ratio=config.FT_WARMUP,
weight_decay=config.FT_WEIGHT_DECAY,
        # The MS MARCO triplet evaluator is disabled above, so select on NQ triplet dev accuracy instead
        metric_for_best_model="eval_nq-triplet-dev_cosine_accuracy",
        greater_is_better=True,
        # Draw each batch from a single dataset, sampling datasets in proportion to their size
multi_dataset_batch_sampler=MultiDatasetBatchSamplers.PROPORTIONAL,
ddp_find_unused_parameters=False,
)
# === INITIALIZE AND RUN TRAINER ===
trainer = SentenceTransformerTrainer(
model=model,
args=training_args,
train_dataset=train_dataset,
loss=loss,
evaluator=main_evaluator,
)
logging.info("--- Running initial evaluation on the untrained model (step 0) ---")
trainer.evaluate()
logging.info(f"Starting model fine-tuning for run: {run_name}")
trainer.train()
logging.info(f"--- Fine-tuning complete for {run_name}! ---")
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="A flexible fine-tuning script for Sentence Transformer models."
)
parser.add_argument(
"--fine_tune_model_path",
type=str,
required=True,
help="Path or Hub name of the model to fine-tune.",
)
parser.add_argument(
"--probe_run", action="store_true", help="If set, runs a short probe run."
)
cli_args = parser.parse_args()
main(cli_args)
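# Example invocation (script name and model path are illustrative):
#   python finetune_saga_embed.py --fine_tune_model_path /path/to/base-model --probe_run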