|
|
import os |
|
|
import json |
|
|
import random |
|
|
import logging |
|
|
|
|
|
import wandb |
|
|
from sentence_transformers import SentenceTransformer, InputExample, evaluation |
|
|
from sentence_transformers.losses import TripletLoss |
|
|
from torch.utils.data import DataLoader |
|
|
|
|
|
from rich.console import Console |
|
|
from rich.table import Table |
|
|
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TimeElapsedColumn |
|
|
|
|
|
|
|
|
# Timestamped INFO-level logging so sentence-transformers / wandb messages are visible.
logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO)


# Shared rich console for tables and status output across the script.
console = Console()
|
|
|
|
|
|
|
|
def load_triplets_from_jsonl(path):
    """Read (anchor, positive, negative) triplets from a JSONL file.

    Each line must be a JSON object with 'anchor', 'positive' and
    'negative' fields.

    Args:
        path: Path to the JSONL file.

    Returns:
        A list of InputExample objects, one per input line.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        return [
            InputExample(texts=[record['anchor'], record['positive'], record['negative']])
            for record in map(json.loads, handle)
        ]
|
|
|
|
|
def split_data(examples, train_ratio=0.8, seed=42):
    """Deterministically split examples into train/validation partitions.

    Unlike a naive shuffle-in-place, this neither mutates the caller's
    list nor reseeds the global ``random`` module state (a hidden side
    effect that could perturb unrelated randomness elsewhere).

    Args:
        examples: List of items to split.
        train_ratio: Fraction of items assigned to the training split.
        seed: Seed for the deterministic shuffle.

    Returns:
        Tuple ``(train, validation)`` of two new lists covering all items.
    """
    rng = random.Random(seed)  # local RNG: same shuffle sequence as random.seed(seed), no global side effect
    shuffled = examples[:]  # copy so the caller's ordering is preserved
    rng.shuffle(shuffled)
    split = int(len(shuffled) * train_ratio)
    return shuffled[:split], shuffled[split:]
|
|
|
|
|
|
|
|
class WandbLoggingTripletLoss(TripletLoss):
    """TripletLoss that reports every training step to wandb and ticks a
    rich progress bar as a side effect of each forward pass."""

    def __init__(self, model, progress_task, progress):
        """
        Args:
            model: The SentenceTransformer being trained.
            progress_task: Task id of the rich progress bar to advance.
            progress: The rich Progress instance that owns the task.
        """
        super().__init__(model)
        self.progress = progress
        self.progress_task = progress_task
        # Monotonically increasing counter; logged as "train_step" starting at 0.
        self.step = 0

    def forward(self, sentence_features, labels):
        """Compute the triplet loss, then log it and advance the progress bar."""
        batch_loss = super().forward(sentence_features, labels)
        self.progress.update(self.progress_task, advance=1)
        # Log with the pre-increment counter so step numbers start at 0.
        wandb.log({"train_step": self.step, "train_loss": batch_loss.item()})
        self.step += 1
        return batch_loss
|
|
|
|
|
|
|
|
class RichLossEvaluator(evaluation.TripletEvaluator):
    """TripletEvaluator that logs each validation result to wandb and
    echoes it through a rich progress display."""

    def __init__(self, anchors, positives, negatives, name, progress_task, progress):
        """
        Args:
            anchors: Anchor texts of the validation triplets.
            positives: Positive texts of the validation triplets.
            negatives: Negative texts of the validation triplets.
            name: Evaluator name passed through to the base class.
            progress_task: Task id of the rich progress bar to advance.
            progress: The rich Progress instance that owns the task.
        """
        super().__init__(
            anchors=anchors, positives=positives, negatives=negatives,
            name=name, show_progress_bar=False
        )
        self.progress = progress
        self.progress_task = progress_task

    def __call__(self, model, output_path=None, epoch=-1, steps=-1, **kwargs):
        """Run the base evaluation, log the score, and update the UI.

        Returns the base class's metrics unchanged so fit() can use them.
        """
        metrics = super().__call__(model, output_path=output_path, epoch=epoch, steps=steps, **kwargs)

        # Newer sentence-transformers versions return a dict of metrics;
        # older ones return a single float. Prefer "accuracy" when present,
        # otherwise fall back to the first value (or the scalar itself).
        val_score = (
            metrics.get("accuracy", next(iter(metrics.values())))
            if isinstance(metrics, dict)
            else metrics
        )
        wandb.log({"validation_step": steps, "validation_score": val_score, "epoch": epoch})

        self.progress.update(self.progress_task, advance=10)
        self.progress.console.log(f"[blue]Step {steps}[/blue]: Validation score: {val_score:.4f}")
        return metrics
|
|
|
|
|
def main():
    """Fine-tune an SBERT model on triplet data.

    Loads triplets from JSONL, splits train/validation, trains with a
    wandb-logging TripletLoss under a rich progress display, saves the
    model, and prints a summary table.
    """
    # ---- configuration -------------------------------------------------
    model_name = 'all-MiniLM-L6-v2'
    jsonl_path = 'triplets.jsonl'
    output_path = 'fine_tuned_sbert_triplet'
    batch_size = 16
    train_ratio = 0.8
    num_epochs = 1

    # ---- model + experiment tracking -----------------------------------
    model = SentenceTransformer(model_name)
    wandb.init(
        project="sbert-triplet-finetune",
        name="custom_run_name",
        config={"model": model_name, "batch_size": batch_size, "epochs": num_epochs}
    )

    # ---- data ----------------------------------------------------------
    examples = load_triplets_from_jsonl(jsonl_path)
    train_exs, val_exs = split_data(examples, train_ratio=train_ratio)

    train_loader = DataLoader(train_exs, shuffle=True, batch_size=batch_size)
    # No validation DataLoader is needed: the TripletEvaluator below
    # consumes the raw text lists directly.

    anchors = [e.texts[0] for e in val_exs]
    positives = [e.texts[1] for e in val_exs]
    negatives = [e.texts[2] for e in val_exs]

    total_train_steps = len(train_loader) * num_epochs
    warmup_steps = int(0.1 * total_train_steps)  # 10% linear LR warmup

    console.print(f"[bold]Training on {len(train_exs)} triplets, validating on {len(val_exs)} triplets[/bold]\n")

    # ---- progress UI ---------------------------------------------------
    # transient=True clears the bar when training finishes so only the
    # summary table remains on screen.
    progress = Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
        TimeElapsedColumn(),
        transient=True,
    )

    # ---- training ------------------------------------------------------
    with progress:
        task = progress.add_task("[green]Training...[/green]", total=total_train_steps)

        train_loss = WandbLoggingTripletLoss(model, task, progress)
        evaluator = RichLossEvaluator(anchors, positives, negatives, 'val-triplet', task, progress)

        model.fit(
            train_objectives=[(train_loader, train_loss)],
            evaluator=evaluator,
            epochs=num_epochs,
            warmup_steps=warmup_steps,
            evaluation_steps=10,
            output_path=output_path,
            show_progress_bar=False  # rich owns the progress display
        )
        progress.update(task, completed=total_train_steps)

    # ---- persist + finish run ------------------------------------------
    model.save(output_path)
    wandb.save(os.path.join(output_path, "pytorch_model.bin"))
    wandb.finish()

    # ---- summary -------------------------------------------------------
    summary = Table(title="Training Summary")
    summary.add_column("Metric", style="cyan", no_wrap=True)
    summary.add_column("Value", style="magenta")

    summary.add_row("Model Name", model_name)
    summary.add_row("Total Triplets", str(len(examples)))
    summary.add_row("Training Triplets", str(len(train_exs)))
    summary.add_row("Validation Triplets", str(len(val_exs)))
    summary.add_row("Epochs", str(num_epochs))
    summary.add_row("Output Path", output_path)

    console.print(summary)
    # Fixed mojibake in the original message (garbled emoji / non-breaking hyphen).
    console.print("[bold green]Fine-tuning complete and model saved![/bold green]")
|
|
|
|
|
# Script entry point: run training only when executed directly, not on import.
if __name__ == "__main__":

    main()
|
|
|
|
|
|