Spaces:
Sleeping
Sleeping
Create train.py
Browse files- src/train.py +73 -0
src/train.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import torch
|
| 3 |
+
import numpy as np
|
| 4 |
+
from sklearn.preprocessing import LabelEncoder
|
| 5 |
+
from torch.utils.data import Dataset, DataLoader
|
| 6 |
+
from sklearn.model_selection import train_test_split
|
| 7 |
+
from model_def import EmotionTransformer
|
| 8 |
+
import joblib
|
| 9 |
+
|
| 10 |
+
# 1) Load & clean data (with your Google Drive links)
|
| 11 |
+
|
| 12 |
+
def load_and_preprocess():
    """Download, merge, deduplicate, and clean the emotion datasets.

    Fetches the train and test CSVs (semicolon-delimited, two columns:
    sentence;label) from Google Drive, concatenates them, drops duplicate
    rows, and adds a 'clean' column with the normalized sentence text.

    Returns:
        pandas.DataFrame with columns 'sentence', 'label', 'clean'.

    NOTE(review): clean_text is neither defined nor imported in this file —
    confirm it is provided elsewhere before running this script.
    """
    train_part = pd.read_csv(
        "https://drive.google.com/uc?export=download&id=14D_HcvTFL63-KffCQLNFxGH-oY_knwmo",
        delimiter=';',
        header=None,
        names=['sentence', 'label'],
    )
    test_part = pd.read_csv(
        "https://drive.google.com/uc?export=download&id=1Vmr1Rfv4pLSlAUrlOCxAcszvlxJOSHrm",
        delimiter=';',
        header=None,
        names=['sentence', 'label'],
    )
    merged = pd.concat([train_part, test_part], ignore_index=True)
    merged = merged.drop_duplicates()
    merged['clean'] = merged['sentence'].apply(clean_text)
    return merged
|
| 25 |
+
|
| 26 |
+
# 2) Build vocab, encode & pad
|
| 27 |
+
from collections import Counter

# Maximum number of tokens kept per sentence: shorter sequences are padded
# with the '<PAD>' id and longer ones truncated (see the encoding in train()).
MAX_LEN=32
|
| 29 |
+
|
| 30 |
+
def build_vocab(tokenized):
    """Assign an integer id to every token, most frequent first.

    Ids 0 and 1 are reserved for the '<PAD>' and '<UNK>' sentinels;
    real tokens receive ids starting at 2 in descending frequency order.

    Args:
        tokenized: iterable of token lists (one list per sentence).

    Returns:
        dict mapping token -> integer id.
    """
    frequencies = Counter(tok for sentence in tokenized for tok in sentence)
    vocab = {}
    for rank, (word, _count) in enumerate(frequencies.most_common()):
        vocab[word] = rank + 2
    vocab['<PAD>'] = 0
    vocab['<UNK>'] = 1
    return vocab
|
| 35 |
+
|
| 36 |
+
# Dataset class
|
| 37 |
+
# BUG FIX: the original line read `def class EmotionDataset(Dataset):`,
# which is a SyntaxError — the stray `def` made the whole module unimportable.
class EmotionDataset(Dataset):
    """Torch Dataset over pre-encoded token-id sequences and integer labels.

    Args:
        X: sequence of fixed-length token-id lists (one per sentence).
        y: sequence of integer class labels, same length as X.
    """

    def __init__(self, X, y):
        # Materialize once as long tensors so __getitem__ is a cheap index.
        self.X = torch.tensor(X, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
|
| 43 |
+
|
| 44 |
+
# 3) Training pipeline
|
| 45 |
+
def train(epochs=5, batch_size=16, lr=1e-3):
    """End-to-end training pipeline for the emotion classifier.

    Loads and cleans the data, builds the vocabulary, encodes and pads the
    sentences to MAX_LEN, fits a label encoder, trains EmotionTransformer,
    and evaluates on a held-out validation split after every epoch.

    Side effects (files written to the working directory):
        label_encoder.pkl, vocab.pkl, emotion_transformer_model.pth

    Args:
        epochs: number of training epochs (default 5, as before).
        batch_size: mini-batch size for both loaders (default 16, as before).
        lr: Adam learning rate (default 1e-3, as before).
    """
    df = load_and_preprocess()
    tokenized = df['clean'].str.split()
    vocab = build_vocab(tokenized)

    # Encode each sentence as token ids, right-pad with <PAD>, truncate to MAX_LEN.
    pad_id, unk_id = vocab['<PAD>'], vocab['<UNK>']
    X = [([vocab.get(t, unk_id) for t in s] + [pad_id] * max(0, MAX_LEN - len(s)))[:MAX_LEN]
         for s in tokenized]

    le = LabelEncoder()
    y = le.fit_transform(df['label'])
    # Persist the fitted artifacts so inference can reproduce the encoding.
    joblib.dump(le, 'label_encoder.pkl')
    joblib.dump(vocab, 'vocab.pkl')

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y)
    train_loader = DataLoader(EmotionDataset(X_train, y_train),
                              batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(EmotionDataset(X_val, y_val), batch_size=batch_size)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = EmotionTransformer(len(vocab), num_classes=len(le.classes_)).to(device)
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    crit = torch.nn.CrossEntropyLoss()

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            opt.zero_grad()
            loss = crit(model(xb), yb)
            loss.backward()
            opt.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1} loss {total_loss/len(train_loader):.4f}")

        # BUG FIX: val_loader was created but never used — run validation
        # after each epoch so training progress is actually measurable.
        model.eval()
        correct = 0
        with torch.no_grad():
            for xb, yb in val_loader:
                xb, yb = xb.to(device), yb.to(device)
                correct += (model(xb).argmax(dim=1) == yb).sum().item()
        print(f"Epoch {epoch+1} val_acc {correct/len(y_val):.4f}")

    torch.save(model.state_dict(), 'emotion_transformer_model.pth')
|
| 71 |
+
|
| 72 |
+
# Script entry point: run the full training pipeline when executed directly
# (e.g. `python src/train.py`); importing the module triggers nothing.
if __name__=='__main__':
    train()
|