Step 1: Install Dependencies

In [None]:
!pip install transformers datasets accelerate evaluate scikit-learn torch
!pip install sentencepiece # for tokenizer support


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting

Step 2: Load the Dataset

In [None]:
!pip install datasets



In [None]:
# Colab-only: open the browser file picker. The recorded output shows the chosen
# file being saved under its own name, which the following cells read by relative path.
from google.colab import files
uploaded = files.upload()  # Upload the downloaded .parquet file here


Saving train-00000-of-00001.parquet to train-00000-of-00001.parquet


In [None]:
import pandas as pd

# Load the uploaded Parquet file into a DataFrame (file name as saved by the upload step)
df = pd.read_parquet("train-00000-of-00001.parquet")

# Show all available column names: text/metadata fields followed by one
# indicator column per emotion (see the printed Index below)
print(df.columns)


Index(['text', 'id', 'author', 'subreddit', 'link_id', 'parent_id',
       'created_utc', 'rater_id', 'example_very_unclear', 'admiration',
       'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion',
       'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust',
       'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy',
       'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief',
       'remorse', 'sadness', 'surprise', 'neutral'],
      dtype='object')


Step 3: Dataset Formatting and Defining Targets

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# First version of the target construction. Re-reads the uploaded shard so the
# cell does not depend on the previous one.
df = pd.read_parquet("train-00000-of-00001.parquet")

# Names of the per-emotion indicator columns in the file.
emotion_columns = [
    'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring',
    'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval',
    'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief',
    'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization',
    'relief', 'remorse', 'sadness', 'surprise', 'neutral'
]

# For each row, list the emotion columns whose flag equals 1
# (names stay in `emotion_columns` order).
df["emotions"] = pd.Series(
    [
        [name for name, flagged in zip(emotion_columns, row_flags) if flagged]
        for row_flags in df[emotion_columns].eq(1).to_numpy()
    ],
    index=df.index,
)

# Coarse polarity assigned to every fine-grained emotion.
emotion_to_sentiment = {
    emotion: polarity
    for polarity, members in (
        ('positive', (
            'admiration', 'amusement', 'approval', 'caring', 'desire', 'excitement',
            'gratitude', 'joy', 'love', 'optimism', 'pride', 'relief',
        )),
        ('negative', (
            'anger', 'annoyance', 'disapproval', 'disgust', 'embarrassment', 'fear',
            'grief', 'nervousness', 'remorse', 'disappointment', 'sadness',
        )),
        ('neutral', (
            'confusion', 'curiosity', 'realization', 'surprise', 'neutral',
        )),
    )
    for emotion in members
}

# A row takes the polarity of its first listed emotion that has a mapping;
# rows with no active emotion fall back to "neutral".
df["sentiment"] = df["emotions"].map(
    lambda active: next(
        (emotion_to_sentiment[name] for name in active if name in emotion_to_sentiment),
        "neutral",
    )
)

# Two-level regression targets: 1.0 when a trigger emotion is present,
# otherwise a low baseline (0.2 for stress, 0.1 for anxiety).
df["stress"] = df["emotions"].map(
    lambda active: 1.0 if {"fear", "nervousness"} & set(active) else 0.2
)
df["anxiety"] = df["emotions"].map(
    lambda active: 0.1 if "nervousness" not in active else 1.0
)

# Integer-encode the sentiment strings (0 = neg, 1 = neutral, 2 = pos).
label_encoder = LabelEncoder()
df["sentiment_label"] = label_encoder.fit(df["sentiment"]).transform(df["sentiment"])

# Keep only the model input and the three targets.
final_df = df[["text", "sentiment_label", "stress", "anxiety"]]
print("✅ Final dataset prepared with", len(final_df), "entries.")
final_df.head(10)


✅ Final dataset prepared with 211225 entries.


Unnamed: 0,text,sentiment_label,stress,anxiety
0,That game hurt.,0,0.2,0.1
1,>sexuality shouldn’t be a grouping category I...,1,0.2,0.1
2,"You do right, if you don't care then fuck 'em!",1,0.2,0.1
3,Man I love reddit.,2,0.2,0.1
4,"[NAME] was nowhere near them, he was by the Fa...",1,0.2,0.1
5,Right? Considering it’s such an important docu...,2,0.2,0.1
6,"He isn't as big, but he's still quite popular....",0,0.2,0.1
7,That's crazy; I went to a super [RELIGION] hig...,2,0.2,0.1
8,that's adorable asf,2,0.2,0.1
9,"""Sponge Blurb Pubs Quaw Haha GURR ha AAa!"" fin...",2,0.2,0.1


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Second version of the target construction (wider stress/anxiety emotion sets).
# Reload the raw shard so this cell is independent of earlier cells.
df = pd.read_parquet("train-00000-of-00001.parquet")

# List of all emotion indicator columns
emotion_columns = [
    'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring',
    'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval',
    'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief',
    'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization',
    'relief', 'remorse', 'sadness', 'surprise', 'neutral'
]

# Extract active emotions (flag == 1) per row, in `emotion_columns` order.
# Built from one boolean matrix instead of a row-wise DataFrame.apply, which is
# very slow on a frame of this size; the resulting column is the same.
active_matrix = df[emotion_columns].eq(1).to_numpy()
df["emotions"] = pd.Series(
    [
        [emotion for emotion, is_on in zip(emotion_columns, row) if is_on]
        for row in active_matrix
    ],
    index=df.index,
)

# Map each fine-grained emotion to a coarse sentiment
emotion_to_sentiment = {
    'admiration': 'positive', 'amusement': 'positive', 'approval': 'positive',
    'caring': 'positive', 'desire': 'positive', 'excitement': 'positive',
    'gratitude': 'positive', 'joy': 'positive', 'love': 'positive',
    'optimism': 'positive', 'pride': 'positive', 'relief': 'positive',

    'anger': 'negative', 'annoyance': 'negative', 'disapproval': 'negative',
    'disgust': 'negative', 'embarrassment': 'negative', 'fear': 'negative',
    'grief': 'negative', 'nervousness': 'negative', 'remorse': 'negative',
    'disappointment': 'negative', 'sadness': 'negative',

    'confusion': 'neutral', 'curiosity': 'neutral', 'realization': 'neutral',
    'surprise': 'neutral', 'neutral': 'neutral'
}

# A row gets the sentiment of its FIRST active emotion (list order), so rows
# with several emotions are resolved by `emotion_columns` order. Rows with no
# active emotion default to "neutral".
df["sentiment"] = df["emotions"].apply(
    lambda emotions: next((emotion_to_sentiment[e] for e in emotions if e in emotion_to_sentiment), "neutral")
)

# Expanded emotion sets
high_stress_emotions = {
    "fear", "nervousness", "grief", "sadness", "embarrassment", "disgust", "remorse", "anger", "disappointment"
}
high_anxiety_emotions = {
    "nervousness", "fear", "embarrassment", "grief", "remorse", "sadness"
}

# Two-level targets: 1.0 when any trigger emotion is active, otherwise a low
# baseline (0.2 for stress, 0.1 for anxiety).
df["stress"] = df["emotions"].apply(
    lambda x: 1.0 if any(e in high_stress_emotions for e in x) else 0.2
)
df["anxiety"] = df["emotions"].apply(
    lambda x: 1.0 if any(e in high_anxiety_emotions for e in x) else 0.1
)

# Encode sentiment: 0 = negative, 1 = neutral, 2 = positive
label_encoder = LabelEncoder()
df["sentiment_label"] = label_encoder.fit_transform(df["sentiment"])
# LabelEncoder numbers the classes it sees in sorted order. The 0/1/2 meaning
# above (and the label maps used at prediction time) only hold when all three
# classes are present, so fail loudly if the data ever lacks one.
assert list(label_encoder.classes_) == ["negative", "neutral", "positive"], (
    f"unexpected sentiment classes: {list(label_encoder.classes_)}"
)

# Final dataset: model input plus the three targets
final_df = df[["text", "sentiment_label", "stress", "anxiety"]]
print("✅ Final dataset prepared with", len(final_df), "entries.")
final_df.head(10)


✅ Final dataset prepared with 211225 entries.


Unnamed: 0,text,sentiment_label,stress,anxiety
0,That game hurt.,0,1.0,1.0
1,>sexuality shouldn’t be a grouping category I...,1,0.2,0.1
2,"You do right, if you don't care then fuck 'em!",1,0.2,0.1
3,Man I love reddit.,2,0.2,0.1
4,"[NAME] was nowhere near them, he was by the Fa...",1,0.2,0.1
5,Right? Considering it’s such an important docu...,2,0.2,0.1
6,"He isn't as big, but he's still quite popular....",0,0.2,0.1
7,That's crazy; I went to a super [RELIGION] hig...,2,0.2,0.1
8,that's adorable asf,2,0.2,0.1
9,"""Sponge Blurb Pubs Quaw Haha GURR ha AAa!"" fin...",2,0.2,0.1


Step 4: Main Model

In [None]:
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer

class MultiTaskSentimentStressModel(nn.Module):
    """Pretrained encoder shared by a sentiment classifier and two score heads.

    The hidden state at sequence position 0 is passed through dropout and then
    fed to three independent linear heads.

    Args:
        model_name: checkpoint name handed to ``AutoModel.from_pretrained``.
        num_sentiment_classes: output width of the sentiment head.
    """

    def __init__(self, model_name="distilbert-base-uncased", num_sentiment_classes=3):
        super().__init__()
        encoder = AutoModel.from_pretrained(model_name)
        self.base_model = encoder
        self.hidden_size = encoder.config.hidden_size

        # Task heads share one dropout layer in front of them.
        self.dropout = nn.Dropout(0.3)
        width = self.hidden_size
        self.sentiment_classifier = nn.Linear(width, num_sentiment_classes)
        self.stress_regressor = nn.Linear(width, 1)
        self.anxiety_regressor = nn.Linear(width, 1)

    def forward(self, input_ids, attention_mask):
        """Return ``(sentiment_logits, stress_score, anxiety_score)``.

        The two scores are squashed with a sigmoid; the sentiment output is
        left as raw logits.
        """
        encoded = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        # First position of the sequence is used as the sentence summary.
        pooled = self.dropout(encoded.last_hidden_state[:, 0])
        return (
            self.sentiment_classifier(pooled),
            self.stress_regressor(pooled).sigmoid(),
            self.anxiety_regressor(pooled).sigmoid(),
        )

# Initialize tokenizer and model (encoder weights are pretrained; the three
# heads are freshly initialised, so the numbers printed below are not meaningful
# predictions yet).
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = MultiTaskSentimentStressModel(model_name)

# Quick smoke test with a single sentence.
# A newly constructed nn.Module is in training mode, so without eval() the head
# dropout would randomly zero features and the printed values would change on
# every run. The training function switches back with model.train().
model.eval()
text = "I feel really anxious and overwhelmed today."
tokens = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
with torch.no_grad():
    sentiment_logits, stress_pred, anxiety_pred = model(**tokens)

print("Sentiment:", torch.argmax(sentiment_logits).item())
print("Stress Score:", stress_pred.item())
print("Anxiety Score:", anxiety_pred.item())


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Sentiment: 2
Stress Score: 0.3642602562904358
Anxiety Score: 0.4117843210697174


Step 5: Prepare Dataset for training

In [None]:
# Re-create the tokenizer so the data-preparation cells can run without the
# Step 4 cell; this is the same "distilbert-base-uncased" checkpoint used there.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


In [None]:
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import torch

# Step 5.1 – Split dataset into 80% train / 10% validation / 10% test.
# The first call holds out 20% of the rows; the second splits that hold-out in half.
# random_state=42 makes both shuffles repeatable.
train_df, temp_df = train_test_split(final_df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Step 5.2 – Create custom PyTorch Dataset
class MultiTaskDataset(Dataset):
    """Map-style dataset yielding tokenized text plus the three training targets.

    Args:
        dataframe: frame with ``text``, ``sentiment_label``, ``stress`` and
            ``anxiety`` columns; its index is reset so rows are addressed 0..n-1.
        tokenizer: callable used to encode one text at a time.
        max_len: length every example is padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len=128):
        self.data = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, idx):
        """Return a dict of tensors for row ``idx``."""
        frame = self.data
        encoded = self.tokenizer(
            frame.loc[idx, "text"],
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )
        # The tokenizer returns a leading batch axis of size 1; drop it so the
        # DataLoader can stack examples itself.
        item = {key: encoded[key].squeeze(0) for key in ("input_ids", "attention_mask")}
        item["sentiment_label"] = torch.tensor(frame.loc[idx, "sentiment_label"], dtype=torch.long)
        for target in ("stress", "anxiety"):
            item[target] = torch.tensor(frame.loc[idx, target], dtype=torch.float)
        return item

# Step 5.3 – Wrap each split in a MultiTaskDataset (shared tokenizer, default max_len)
train_dataset, val_dataset, test_dataset = (
    MultiTaskDataset(split_frame, tokenizer)
    for split_frame in (train_df, val_df, test_df)
)

# Step 5.4 – Batch the datasets; only the training loader shuffles
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
val_loader = DataLoader(val_dataset, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)

print(f"✅ DataLoaders ready — Train: {len(train_loader)} batches, Val: {len(val_loader)}, Test: {len(test_loader)}")


✅ DataLoaders ready — Train: 10562 batches, Val: 1321, Test: 1321


Step 6: Model Training

In [None]:
import torch
import torch.nn as nn
from torch.optim import AdamW
from tqdm import tqdm

# Step 6.0 – Work on fixed random subsets (5000 train / 1000 validation rows).
# Note: this rebinds train_dataset/val_dataset/train_loader/val_loader from Step 5.
train_df_small, val_df_small = (
    frame.sample(n=size, random_state=42).reset_index(drop=True)
    for frame, size in ((train_df, 5000), (val_df, 1000))
)

train_dataset = MultiTaskDataset(train_df_small, tokenizer)
val_dataset = MultiTaskDataset(val_df_small, tokenizer)

train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
val_loader = DataLoader(val_dataset, batch_size=16)

# Step 6.1 – Use the GPU when one is visible, otherwise the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)

# Step 6.2 – Cross-entropy for the sentiment classes, MSE for the two scores
ce_loss, mse_loss = nn.CrossEntropyLoss(), nn.MSELoss()

# Step 6.3 – One optimizer over the encoder and all heads
optimizer = AdamW(params=model.parameters(), lr=2e-5)

# Step 6.4 – Training function (LIMITED batches)
def train_epoch(model, dataloader, max_batches=100):
    """Run one (possibly truncated) training pass and return the mean batch loss.

    Uses the notebook-level ``device``, ``optimizer``, ``ce_loss`` and ``mse_loss``.

    Args:
        model: module returning ``(sentiment_logits, stress_pred, anxiety_pred)``.
        dataloader: iterable of batches with keys ``input_ids``,
            ``attention_mask``, ``sentiment_label``, ``stress`` and ``anxiety``.
        max_batches: stop after this many batches.

    Returns:
        Sum of per-batch losses divided by the number of batches actually
        processed (0.0 when no batch was processed).
    """
    model.train()
    total_loss = 0.0
    batches_seen = 0
    for i, batch in enumerate(tqdm(dataloader, desc="Training")):
        if i >= max_batches:
            break
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        sentiment = batch["sentiment_label"].to(device)
        stress = batch["stress"].to(device)
        anxiety = batch["anxiety"].to(device)

        optimizer.zero_grad()
        sentiment_logits, stress_pred, anxiety_pred = model(input_ids, attention_mask)

        loss_sentiment = ce_loss(sentiment_logits, sentiment)
        # squeeze(-1) removes only the trailing size-1 axis. A bare squeeze()
        # would also drop the batch axis when a batch holds a single example,
        # leaving a 0-d prediction against a 1-d target.
        loss_stress = mse_loss(stress_pred.squeeze(-1), stress)
        loss_anxiety = mse_loss(anxiety_pred.squeeze(-1), anxiety)

        # The three task losses are summed with equal weight.
        loss = loss_sentiment + loss_stress + loss_anxiety
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        batches_seen += 1
    # Average over what was really processed: the loader may hold fewer than
    # max_batches batches, and dividing by max_batches would understate the loss.
    return total_loss / batches_seen if batches_seen else 0.0

# Step 6.5 – Evaluation function (LIMITED batches)
def eval_epoch(model, dataloader, max_batches=100):
    """Evaluate on up to ``max_batches`` batches and return the mean batch loss.

    Uses the notebook-level ``device``, ``ce_loss`` and ``mse_loss``. Gradients
    are disabled and the model is put in eval mode.

    Args:
        model: module returning ``(sentiment_logits, stress_pred, anxiety_pred)``.
        dataloader: iterable of batches with keys ``input_ids``,
            ``attention_mask``, ``sentiment_label``, ``stress`` and ``anxiety``.
        max_batches: stop after this many batches.

    Returns:
        Sum of per-batch losses divided by the number of batches actually
        processed (0.0 when no batch was processed).
    """
    model.eval()
    total_loss = 0.0
    batches_seen = 0
    with torch.no_grad():
        for i, batch in enumerate(tqdm(dataloader, desc="Evaluating")):
            if i >= max_batches:
                break
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            sentiment = batch["sentiment_label"].to(device)
            stress = batch["stress"].to(device)
            anxiety = batch["anxiety"].to(device)

            sentiment_logits, stress_pred, anxiety_pred = model(input_ids, attention_mask)

            loss_sentiment = ce_loss(sentiment_logits, sentiment)
            # squeeze(-1) keeps the batch axis even for a single-example batch.
            loss_stress = mse_loss(stress_pred.squeeze(-1), stress)
            loss_anxiety = mse_loss(anxiety_pred.squeeze(-1), anxiety)

            loss = loss_sentiment + loss_stress + loss_anxiety
            total_loss += loss.item()
            batches_seen += 1
    # Divide by the batches really seen. With a loader shorter than max_batches
    # (e.g. 63 batches against max_batches=100), dividing by max_batches would
    # report a loss that is too small and not comparable to the training loss.
    return total_loss / batches_seen if batches_seen else 0.0

# Step 6.6 – Run training
# Each epoch trains on at most 100 batches of the (shuffled) training loader and
# then evaluates on at most 100 batches of the validation loader, so an "epoch"
# here is a capped pass, not a full sweep over the subset.
EPOCHS = 2
for epoch in range(EPOCHS):
    print(f"\n🔁 Epoch {epoch + 1}/{EPOCHS}")
    train_loss = train_epoch(model, train_loader, max_batches=100)
    val_loss = eval_epoch(model, val_loader, max_batches=100)
    print(f"✅ Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")



🔁 Epoch 1/2


Training:  32%|███▏      | 100/313 [12:24<26:25,  7.44s/it]
Evaluating: 100%|██████████| 63/63 [02:17<00:00,  2.18s/it]


✅ Train Loss: 1.1532, Val Loss: 0.6480

🔁 Epoch 2/2


Training:  32%|███▏      | 100/313 [11:58<25:30,  7.18s/it]
Evaluating: 100%|██████████| 63/63 [02:18<00:00,  2.19s/it]

✅ Train Loss: 0.9415, Val Loss: 0.6263





Step 7: Save, predict and evaluate

In [None]:
import torch
import os
from sklearn.metrics import accuracy_score, mean_absolute_error

# Step 7.1 – Save model and tokenizer
# Only the state dict (parameter tensors keyed by the custom class's attribute
# names) is written, so loading it back requires re-creating
# MultiTaskSentimentStressModel first and then calling load_state_dict.
save_dir = "multi_task_model_v1"
os.makedirs(save_dir, exist_ok=True)
torch.save(model.state_dict(), os.path.join(save_dir, "pytorch_model.bin"))
tokenizer.save_pretrained(save_dir)
print("✅ Model and tokenizer saved to", save_dir)

# Step 7.2 – Use only first N batches for prediction
N_BATCHES = 50  # You can reduce/increase as needed

# Inference mode: disables dropout; no_grad below skips gradient bookkeeping.
model.eval()
true_sentiments, pred_sentiments = [], []
true_stress, pred_stress = [], []
true_anxiety, pred_anxiety = [], []

with torch.no_grad():
    for i, batch in enumerate(tqdm(test_loader, desc="📊 Predicting on test set")):
        if i >= N_BATCHES:
            break
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        sentiment = batch["sentiment_label"].to(device)
        stress = batch["stress"].to(device)
        anxiety = batch["anxiety"].to(device)

        sentiment_logits, stress_pred, anxiety_pred = model(input_ids, attention_mask)

        true_sentiments.extend(sentiment.cpu().tolist())
        pred_sentiments.extend(torch.argmax(sentiment_logits, dim=1).cpu().tolist())
        true_stress.extend(stress.cpu().tolist())
        # squeeze(-1) keeps the batch axis. With a bare squeeze() a batch of one
        # example becomes a 0-d tensor, tolist() returns a plain float, and
        # list.extend() raises TypeError.
        pred_stress.extend(stress_pred.squeeze(-1).cpu().tolist())
        true_anxiety.extend(anxiety.cpu().tolist())
        pred_anxiety.extend(anxiety_pred.squeeze(-1).cpu().tolist())

# Step 7.3 – Score the collected predictions: accuracy for the sentiment class,
# mean absolute error for the two continuous scores.
acc, mae_stress, mae_anxiety = (
    accuracy_score(true_sentiments, pred_sentiments),
    mean_absolute_error(true_stress, pred_stress),
    mean_absolute_error(true_anxiety, pred_anxiety),
)

print(
    f"\n📊 Evaluation on {len(true_sentiments)} test samples:",
    f"✅ Sentiment Accuracy: {acc:.4f}",
    f"✅ Stress MAE: {mae_stress:.4f}",
    f"✅ Anxiety MAE: {mae_anxiety:.4f}",
    sep="\n",
)


✅ Model and tokenizer saved to multi_task_model_v1


📊 Predicting on test set:   4%|▍         | 50/1321 [01:50<46:37,  2.20s/it]


📊 Evaluation on 800 test samples:
✅ Sentiment Accuracy: 0.5900
✅ Stress MAE: 0.2076
✅ Anxiety MAE: 0.1460





Save to Hugging Face

In [None]:
# ✅ Install dependencies
!pip install -q huggingface_hub transformers

# ✅ Login to Hugging Face Hub
from huggingface_hub import login
login(token="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")  # ⬅️ Replace with your real token


In [None]:
# Step 1: save the trained weights locally, then export the fine-tuned encoder
# as a standard sequence-classification checkpoint and push it to the Hub.
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification
import torch
import os
import json

save_dir = "multi_task_model_v1"
os.makedirs(save_dir, exist_ok=True)

# `model` must be the trained MultiTaskSentimentStressModel from the training
# cells. It is deliberately NOT rebound in this cell.

# 1. Save the raw multi-task weights. Their keys start with `base_model.`,
#    `sentiment_classifier.`, `stress_regressor.` and `anxiety_regressor.`,
#    so this file can only be loaded into MultiTaskSentimentStressModel.
torch.save(model.state_dict(), f"{save_dir}/pytorch_model.bin")

# 2. Save config manually
config = AutoConfig.from_pretrained("distilbert-base-uncased", num_labels=3)
config.save_pretrained(save_dir)

# 3. Save tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
tokenizer.save_pretrained(save_dir)

# 4. Build the model to publish.
#    Loading `save_dir` with AutoModelForSequenceClassification.from_pretrained
#    matches none of the saved keys (that class expects `distilbert.` /
#    `pre_classifier.` / `classifier.`), so every weight, the encoder included,
#    was re-initialised at random and a random model was pushed. That is what
#    the "newly initialized" warning reports. Instead, create the architecture
#    from the config and copy the fine-tuned encoder into it explicitly.
#    The classification head of this architecture differs from the custom heads
#    and stays freshly initialised; it has to be trained before use.
hub_model = AutoModelForSequenceClassification.from_config(config)
hub_model.distilbert.load_state_dict(model.base_model.state_dict())
hub_model.push_to_hub("Sohan2004/TextSentimentClassifierV1")
tokenizer.push_to_hub("Sohan2004/TextSentimentClassifierV1")

print("✅ Model and tokenizer pushed to Hugging Face at https://huggingface.co/Sohan2004/TextSentimentClassifierV1")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at multi_task_model_v1 and are newly initialized: ['classifier.bias', 'classifier.weight', 'distilbert.embeddings.LayerNorm.bias', 'distilbert.embeddings.LayerNorm.weight', 'distilbert.embeddings.position_embeddings.weight', 'distilbert.embeddings.word_embeddings.weight', 'distilbert.transformer.layer.0.attention.k_lin.bias', 'distilbert.transformer.layer.0.attention.k_lin.weight', 'distilbert.transformer.layer.0.attention.out_lin.bias', 'distilbert.transformer.layer.0.attention.out_lin.weight', 'distilbert.transformer.layer.0.attention.q_lin.bias', 'distilbert.transformer.layer.0.attention.q_lin.weight', 'distilbert.transformer.layer.0.attention.v_lin.bias', 'distilbert.transformer.layer.0.attention.v_lin.weight', 'distilbert.transformer.layer.0.ffn.lin1.bias', 'distilbert.transformer.layer.0.ffn.lin1.weight', 'distilbert.transformer.layer.0.ffn.lin2.bias', 'distilbert.transformer.layer.

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

✅ Model and tokenizer pushed to Hugging Face at https://huggingface.co/Sohan2004/TextSentimentClassifierV1


Further Training

In [None]:
# ✅ Fine-tune Sohan2004/TextSentimentClassifierV1 on diary-style emotion-labeled data

!pip install -q transformers datasets huggingface_hub


In [None]:

import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer
)
from huggingface_hub import login
from google.colab import files

# Upload the CSV file (should contain 'text' and 'emotions' columns)
uploaded = files.upload()
# files.upload() returns a dict keyed by the name each file was stored under.
# Read that name instead of a hard-coded one, so the cell also works when the
# file is called something else (or gets a suffix because the name was taken).
# Falls back to the original default name when nothing was uploaded.
csv_name = next(iter(uploaded), "journal_entries.csv")
df = pd.read_csv(csv_name)

# Reduced emotion -> sentiment table used for fine-tuning.
# Note: this rebinds `emotion_to_sentiment` from the earlier, larger table.
emotion_to_sentiment = {
    'joy': 'positive',
    'love': 'positive',
    'gratitude': 'positive',
    'relief': 'positive',
    'anger': 'negative',
    'sadness': 'negative',
    'fear': 'negative',
    'disgust': 'negative',
    'neutral': 'neutral',
}

def map_emotions(e):
    """Return the sentiment for an emotions string.

    The lower-cased string is searched for each table key as a substring, in
    table order; the first key found decides the sentiment. Non-string values
    and strings containing no key yield "neutral".
    """
    if not isinstance(e, str):
        return "neutral"
    lowered = e.lower()
    return next(
        (sentiment for emo, sentiment in emotion_to_sentiment.items() if emo in lowered),
        "neutral",
    )

# Derive a sentiment string per row from the 'emotions' column
df["sentiment"] = df["emotions"].apply(map_emotions)
# Integer ids for the three sentiment strings
label_map = {"negative": 0, "neutral": 1, "positive": 2}
df["label"] = df["sentiment"].map(label_map)
# Keep only the text and its integer label for the Hugging Face dataset
dataset = Dataset.from_pandas(df[["text", "label"]])

# Tokenizer & preprocessing (tokenizer is loaded from the published Hub repo)
model_id = "Sohan2004/TextSentimentClassifierV1"
tokenizer = AutoTokenizer.from_pretrained(model_id)

def preprocess(example):
    """Tokenize the ``text`` field, padded/truncated to 128 tokens."""
    texts = example["text"]
    return tokenizer(texts, max_length=128, padding="max_length", truncation=True)

# Tokenize in batches, expose the target under the name "labels", and return
# the three model fields as torch tensors.
tokenized_dataset = dataset.map(preprocess, batched=True).rename_column("label", "labels")
tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# Load the sequence-classification model published on the Hub
model = AutoModelForSequenceClassification.from_pretrained(model_id)

# Train/Test split (80/20). The seed fixes the shuffle so the same rows land in
# each split on every run; without it the evaluation numbers are not repeatable.
split = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_data = split["train"]
eval_data = split["test"]

# Training setup
import inspect

# Newer transformers releases renamed two keyword arguments
# (TrainingArguments: evaluation_strategy -> eval_strategy,
#  Trainer: tokenizer -> processing_class). The install cell does not pin a
# version, so pick whichever name the installed release accepts.
_args_params = inspect.signature(TrainingArguments.__init__).parameters
_eval_key = "eval_strategy" if "eval_strategy" in _args_params else "evaluation_strategy"
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

args = TrainingArguments(
    output_dir="./finetuned_TextSentimentClassifierV1",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    # Evaluate once per epoch, matching the checkpoint schedule above.
    **{_eval_key: "epoch"},
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    **{_tokenizer_key: tokenizer},
)

# Train
trainer.train()

# Save the fine-tuned model and tokenizer locally (same directory the Trainer
# uses as output_dir)
model.save_pretrained("finetuned_TextSentimentClassifierV1")
tokenizer.save_pretrained("finetuned_TextSentimentClassifierV1")

# Upload to Hugging Face. This pushes to the same repo id the model was loaded
# from (model_id), so the published weights are replaced.
login()  # Paste your token when prompted
model.push_to_hub("Sohan2004/TextSentimentClassifierV1")
tokenizer.push_to_hub("Sohan2004/TextSentimentClassifierV1")


Test Case

In [None]:
import torch
from transformers import AutoTokenizer

# Load the tokenizer files from the local "multi_task_model_v1" directory
# (written by tokenizer.save_pretrained in the save step)
tokenizer = AutoTokenizer.from_pretrained("multi_task_model_v1")

# Re-define model class (same as Step 4) so the saved state dict can be loaded.
# `nn` is imported here because this cell otherwise imports only `torch`;
# without it the class statement works only if an earlier cell already ran.
import torch.nn as nn


class MultiTaskSentimentStressModel(nn.Module):
    """Pretrained encoder with a sentiment classifier and two sigmoid score heads.

    Attribute names must stay exactly as they are: they form the keys of the
    saved state dict that is loaded into this class.

    Args:
        model_name: checkpoint name handed to ``AutoModel.from_pretrained``.
        num_sentiment_classes: output width of the sentiment head.
    """

    def __init__(self, model_name="distilbert-base-uncased", num_sentiment_classes=3):
        super().__init__()
        # Imported lazily, as in the original cell, so that defining the class
        # does not require transformers to be importable.
        from transformers import AutoModel
        self.base_model = AutoModel.from_pretrained(model_name)
        self.hidden_size = self.base_model.config.hidden_size
        self.dropout = nn.Dropout(0.3)
        self.sentiment_classifier = nn.Linear(self.hidden_size, num_sentiment_classes)
        self.stress_regressor = nn.Linear(self.hidden_size, 1)
        self.anxiety_regressor = nn.Linear(self.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        """Return ``(sentiment_logits, stress_score, anxiety_score)``."""
        output = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state at sequence position 0 is used as the sentence summary.
        cls_output = self.dropout(output.last_hidden_state[:, 0, :])
        sentiment_logits = self.sentiment_classifier(cls_output)
        stress_score = torch.sigmoid(self.stress_regressor(cls_output))
        anxiety_score = torch.sigmoid(self.anxiety_regressor(cls_output))
        return sentiment_logits, stress_score, anxiety_score

# Load the trained model: build the architecture (with the default base
# checkpoint), then overwrite its parameters with the saved state dict.
# map_location keeps the tensors on the CPU regardless of where they were saved.
model = MultiTaskSentimentStressModel()
model.load_state_dict(torch.load("multi_task_model_v1/pytorch_model.bin", map_location=torch.device("cpu")))
# Inference mode: disables dropout
model.eval()

# Helper for trying the model on free-form diary text
def predict_diary_entry(text):
    """Print the predicted sentiment, stress score and anxiety score for ``text``.

    Uses the notebook-level ``tokenizer`` and ``model``. Scores are rounded to
    three decimals. Nothing is returned; the result is only printed.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        logits, stress_out, anxiety_out = model(**encoded)

    # Class index -> display name; any other index is reported as "Unknown".
    names = ("Negative", "Neutral", "Positive")
    winner = torch.argmax(logits, dim=1).item()
    sentiment = names[winner] if 0 <= winner < len(names) else "Unknown"

    report = (
        ("\n📝 Input:", text),
        ("🔎 Sentiment:", sentiment),
        ("📉 Stress Score:", round(stress_out.item(), 3)),
        ("😟 Anxiety Score:", round(anxiety_out.item(), 3)),
    )
    for label, value in report:
        print(label, value)

# Example: replace the string with your own diary entry; the prediction is printed below
predict_diary_entry("I am happy")



📝 Input: I am happy
🔎 Sentiment: Positive
📉 Stress Score: 0.314
😟 Anxiety Score: 0.178
