# Japanese_sentiment / train.py
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from janome.tokenizer import Tokenizer
# =====================
# Settings
# =====================
MAX_LEN = 20     # pad/truncate every sentence to this many tokens
BATCH_SIZE = 32
EMBED_SIZE = 64  # embedding dimension
EPOCHS = 100
LR = 0.05
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# =====================
# Tokenizer
# =====================
tokenizer = Tokenizer()
def tokenize(text):
    # Split a sentence into surface forms with janome's morphological analyzer.
    return [token.surface for token in tokenizer.tokenize(text)]
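# Illustrative only; the exact split depends on janome's dictionary:
# tokenize("この映画は面白い") -> ["この", "映画", "は", "面白い"]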
# =====================
# Load dataset
# =====================
train_df = pd.read_csv("japanese_sentiment_train.csv")
test_df = pd.read_csv("japanese_sentiment_test.csv") # separate test set
train_texts = train_df["text"].tolist()
train_labels = train_df["label"].tolist()
test_texts = test_df["text"].tolist()
test_labels = test_df["label"].tolist()
# =====================
# Build vocabulary
# =====================
vocab = {"<PAD>": 0, "<UNK>": 1}
for text in train_texts:  # build the vocabulary from the training set only
    for token in tokenize(text):
        if token not in vocab:
            vocab[token] = len(vocab)
vocab_size = len(vocab)
print("Vocab size:", vocab_size)
# =====================
# Convert text to tensor
# =====================
def encode(text):
    tokens = tokenize(text)
    ids = [vocab.get(token, vocab["<UNK>"]) for token in tokens]
    # Pad with <PAD> (id 0) or truncate so every sequence has length MAX_LEN.
    if len(ids) < MAX_LEN:
        ids += [vocab["<PAD>"]] * (MAX_LEN - len(ids))
    else:
        ids = ids[:MAX_LEN]
    return ids
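# Illustrative: a 4-token sentence yields 4 ids followed by 16 <PAD> ids, e.g.
# encode("この映画は面白い") -> [5, 2, 7, 41, 0, 0, ..., 0]  (ids here are hypothetical)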
# =====================
# Dataset class
# =====================
class JapaneseDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Encode lazily so the full corpus is never held as tensors in memory.
        x = torch.tensor(encode(self.texts[idx]), dtype=torch.long)
        y = torch.tensor(self.labels[idx], dtype=torch.float32)
        return x, y
# =====================
# Datasets and loaders (the test set comes from a separate file, so no split is needed)
# =====================
train_dataset = JapaneseDataset(train_texts, train_labels)
test_dataset = JapaneseDataset(test_texts, test_labels)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)
# =====================
# Model
# =====================
class SentimentModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, EMBED_SIZE)
        self.fc = nn.Sequential(
            nn.Linear(EMBED_SIZE, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.embedding(x)  # (batch, MAX_LEN, EMBED_SIZE)
        x = x.mean(dim=1)      # mean-pool over tokens; note <PAD> positions are included
        x = self.fc(x)         # (batch, 1), probability in [0, 1]
        # Squeeze only the last dim so a batch of size 1 still yields shape (1,).
        return x.squeeze(-1)
model = SentimentModel(vocab_size).to(device)
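# Quick shape sanity check (added here as an illustration): a dummy batch of
# 2 padded sequences should yield one probability per example.
dummy = torch.zeros(2, MAX_LEN, dtype=torch.long, device=device)
assert model(dummy).shape == (2,)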
# =====================
# Loss and optimizer
# =====================
criterion = nn.BCELoss()  # expects probabilities; the model already ends in Sigmoid
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
# =====================
# Training loop
# =====================
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for x, y in train_loader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        outputs = model(x)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")
# =====================
# Evaluation
# =====================
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for x, y in test_loader:
        x, y = x.to(device), y.to(device)
        outputs = model(x)
        predicted = (outputs > 0.5).float()  # threshold sigmoid output at 0.5
        correct += (predicted == y).sum().item()
        total += y.size(0)
accuracy = correct / total
print("Accuracy:", accuracy)
# Save weights together with the vocabulary so inference can rebuild the same encoding.
torch.save({
    "model_state_dict": model.state_dict(),
    "vocab": vocab
}, "japanese_sentiment_model.pth")
print("Model saved successfully.")