plutchikk / scripts /training_starter.py
Krsnapriya's picture
Production-grade Plutchik ERC Benchmark with Nemotron-3 Integration
0c5db65 verified
import torch
from transformers import AutoTokenizer, AutoModel
# 1. Setup - Using a standard BERT model
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
# 2. Sample Data (Representing your Plutchik dataset)
sentences = [
"I am so incredibly happy with this result!", # Joy
"I can't believe you would betray my trust like that.", # Disgust/Anger
]
print(f"--- Processing {len(sentences)} sentences ---")
# 3. Tokenization Step
# This converts text to 'input_ids' (numbers) and 'attention_mask' (1s and 0s)
inputs = tokenizer(
sentences,
padding=True,
truncation=True,
max_length=12,
return_tensors="pt" # Return PyTorch tensors
)
print("\n[Token IDs]:")
print(inputs["input_ids"])
# 4. Embedding Step (The 'Forward Pass')
# We pass the tokens into the BERT model to get the vectors
with torch.no_grad():
outputs = model(**inputs)
# The 'last_hidden_state' contains the embeddings for every token
# Shape: [Batch_Size, Sequence_Length, Hidden_Dimension]
embeddings = outputs.last_hidden_state
print("\n[Embedding Shape]:")
print(embeddings.shape)
# Result will be [2, 12, 768] -> 2 sentences, 12 tokens each, 768 dimensions per token
# 5. Sentence-Level Embedding (CLS Token)
# Usually, for Emotion Classification, we just take the first token ([CLS])
# as the summary of the whole sentence.
sentence_vectors = embeddings[:, 0, :]
print("\n[Sentence Vector for Sentence 1 (First 5 values)]:")
print(sentence_vectors[0][:5])