opinder2906 committed
Commit 471fbb4 · verified · 1 Parent(s): 2bace3f

Create train.py

Files changed (1)
src/train.py +73 -0
src/train.py ADDED
@@ -0,0 +1,73 @@
+ import re  # needed by the clean_text helper defined below
+ import pandas as pd
+ import torch
+ import numpy as np
+ from sklearn.preprocessing import LabelEncoder
+ from torch.utils.data import Dataset, DataLoader
+ from sklearn.model_selection import train_test_split
+ from model_def import EmotionTransformer
+ import joblib
+
+ # 1) Load & clean data (with your Google Drive links)
+
+ # Assumed minimal cleaner for clean_text, which the script calls but never defines:
+ # lowercase the sentence and keep only letters and whitespace.
+ def clean_text(text):
+     text = str(text).lower()
+     return re.sub(r"[^a-z\s]", " ", text).strip()
+
+ def load_and_preprocess():
+     df = pd.read_csv(
+         "https://drive.google.com/uc?export=download&id=14D_HcvTFL63-KffCQLNFxGH-oY_knwmo",
+         delimiter=';', header=None, names=['sentence','label']
+     )
+     ts_df = pd.read_csv(
+         "https://drive.google.com/uc?export=download&id=1Vmr1Rfv4pLSlAUrlOCxAcszvlxJOSHrm",
+         delimiter=';', header=None, names=['sentence','label']
+     )
+     # Merge both CSVs, drop duplicate rows, and clean every sentence.
+     df = pd.concat([df, ts_df], ignore_index=True)
+     df.drop_duplicates(inplace=True)
+     df['clean'] = df['sentence'].apply(clean_text)
+     return df
+
+ # 2) Build vocab, encode & pad
+ from collections import Counter
+ MAX_LEN=32
+
+ def build_vocab(tokenized):
+     # Most-frequent-first word index; ids 0 and 1 are reserved for <PAD> and <UNK>.
+     counter = Counter([t for sent in tokenized for t in sent])
+     vocab = {w:i+2 for i,(w,_) in enumerate(counter.most_common())}
+     vocab['<PAD>']=0; vocab['<UNK>']=1
+     return vocab
+
+ # Dataset class: wraps padded index sequences and integer labels as tensors
+ class EmotionDataset(Dataset):
+     def __init__(self, X, y):
+         self.X=torch.tensor(X,dtype=torch.long)
+         self.y=torch.tensor(y,dtype=torch.long)
+     def __len__(self): return len(self.X)
+     def __getitem__(self,idx): return self.X[idx],self.y[idx]
+
+ # 3) Training pipeline
+ def train():
+     df=load_and_preprocess()
+     tokenized = df['clean'].str.split()
+     vocab=build_vocab(tokenized)
+     # Encode each sentence as word indices, pad with <PAD> and truncate to MAX_LEN.
+     X = [([vocab.get(t,vocab['<UNK>']) for t in s] + [vocab['<PAD>']]*max(0,MAX_LEN-len(s)))[:MAX_LEN]
+          for s in tokenized]
+     le=LabelEncoder(); y=le.fit_transform(df['label'])
+     joblib.dump(le,'label_encoder.pkl'); joblib.dump(vocab,'vocab.pkl')
+
+     X_train,X_val,y_train,y_val=train_test_split(X,y,test_size=0.2,random_state=42,stratify=y)
+     train_loader=DataLoader(EmotionDataset(X_train,y_train),batch_size=16,shuffle=True)
+     val_loader=DataLoader(EmotionDataset(X_val,y_val),batch_size=16)  # held out; not used in the loop below
+
+     device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+     model=EmotionTransformer(len(vocab),num_classes=len(le.classes_)).to(device)
+     opt=torch.optim.Adam(model.parameters(),lr=1e-3)
+     crit=torch.nn.CrossEntropyLoss()
+
+     for epoch in range(5):
+         model.train(); total_loss=0
+         for xb,yb in train_loader:
+             xb,yb=xb.to(device),yb.to(device)
+             opt.zero_grad(); out=model(xb)
+             loss=crit(out,yb); loss.backward(); opt.step(); total_loss+=loss.item()
+         print(f"Epoch {epoch+1} loss {total_loss/len(train_loader):.4f}")
+     torch.save(model.state_dict(),'emotion_transformer_model.pth')
+
+ if __name__=='__main__':
+     train()
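
Running the new script with python src/train.py trains for five epochs and writes three artifacts into the working directory: vocab.pkl, label_encoder.pkl, and emotion_transformer_model.pth. As a rough illustration of how those artifacts could be consumed later, here is a minimal inference sketch. It assumes EmotionTransformer accepts the same (vocab_size, num_classes=...) arguments used in train() and that its forward pass maps a batch of padded index sequences to class logits; the predict helper and the sample sentence are hypothetical.

# inference_sketch.py (hypothetical companion script, not part of this commit)
import joblib
import torch
from model_def import EmotionTransformer

MAX_LEN = 32  # must match MAX_LEN in src/train.py

vocab = joblib.load('vocab.pkl')
le = joblib.load('label_encoder.pkl')
model = EmotionTransformer(len(vocab), num_classes=len(le.classes_))
model.load_state_dict(torch.load('emotion_transformer_model.pth', map_location='cpu'))
model.eval()

def predict(sentence):
    # Mirror the training preprocessing: lowercase, split, map to indices, pad/truncate.
    tokens = sentence.lower().split()
    ids = [vocab.get(t, vocab['<UNK>']) for t in tokens]
    ids = (ids + [vocab['<PAD>']] * max(0, MAX_LEN - len(ids)))[:MAX_LEN]
    with torch.no_grad():
        logits = model(torch.tensor([ids], dtype=torch.long))
    return le.inverse_transform([logits.argmax(dim=-1).item()])[0]

print(predict("i feel wonderful today"))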