plutchikk / scripts /training_starter.py

Production-grade Plutchik ERC Benchmark with Nemotron-3 Integration

0c5db65 verified about 1 month ago

1.55 kB

	import torch
	from transformers import AutoTokenizer, AutoModel

	# Minimal walkthrough of turning raw sentences into BERT embeddings:
	# load a pretrained tokenizer/encoder, tokenize a tiny batch, run one
	# gradient-free forward pass, and read off the per-sentence [CLS] vectors.

	# 1. Setup - Using a standard BERT model
	# from_pretrained resolves the name against the Hugging Face Hub (or the
	# local cache), so the first run needs network access.
	# NOTE(review): from_pretrained is documented to return the model in eval
	# mode, so no explicit model.eval() call is made here - confirm if the
	# loading code ever changes.
	model_name = "bert-base-uncased"
	tokenizer = AutoTokenizer.from_pretrained(model_name)
	model = AutoModel.from_pretrained(model_name)

	# 2. Sample Data (Representing your Plutchik dataset)
	sentences = [
	    "I am so incredibly happy with this result!",  # Joy
	    "I can't believe you would betray my trust like that.",  # Disgust/Anger
	]

	print(f"--- Processing {len(sentences)} sentences ---")

	# 3. Tokenization Step
	# This converts text to 'input_ids' (numbers) and 'attention_mask' (1s and 0s).
	# padding=True pads shorter sentences up to the longest one in the batch;
	# truncation=True with max_length=12 cuts any sentence that would exceed
	# 12 token IDs (special tokens included), so long inputs lose their tail.
	inputs = tokenizer(
	    sentences,
	    padding=True,
	    truncation=True,
	    max_length=12,
	    return_tensors="pt",  # Return PyTorch tensors
	)

	print("\n[Token IDs]:")
	print(inputs["input_ids"])

	# 4. Embedding Step (The 'Forward Pass')
	# We pass the tokens into the BERT model to get the vectors.
	# The call must sit inside the no_grad block: we only read activations here,
	# so autograd bookkeeping would just waste memory.
	with torch.no_grad():
	    outputs = model(**inputs)

	# The 'last_hidden_state' contains the embeddings for every token
	# Shape: [Batch_Size, Sequence_Length, Hidden_Dimension]
	embeddings = outputs.last_hidden_state

	print("\n[Embedding Shape]:")
	print(embeddings.shape)
	# Expected: [2, 12, 768] -> 2 sentences, 12 token positions each (after
	# padding/truncation), 768 dimensions per token for bert-base.

	# 5. Sentence-Level Embedding (CLS Token)
	# Usually, for Emotion Classification, we just take the first token ([CLS])
	# as the summary of the whole sentence: index 0 along the sequence axis.
	sentence_vectors = embeddings[:, 0, :]
	print("\n[Sentence Vector for Sentence 1 (First 5 values)]:")
	print(sentence_vectors[0][:5])