Spaces:
Running
Running
| import torch | |
| import torch.nn as nn | |
| from torch.utils.data import Dataset, DataLoader | |
| import pandas as pd | |
| from janome.tokenizer import Tokenizer | |
| from sklearn.model_selection import train_test_split | |
# =====================
# Settings
# =====================
# Sequence and batching
MAX_LEN = 20      # number of token ids kept per example
BATCH_SIZE = 32   # examples per DataLoader batch

# Model and optimisation
EMBED_SIZE = 64   # width of each token embedding
EPOCHS = 100      # full passes over the training data
LR = 0.05         # learning rate handed to the optimizer

# Run on the GPU when PyTorch reports one, otherwise fall back to the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)
# =====================
# Tokenizer
# =====================
# A single janome Tokenizer is created once and shared by every call below.
tokenizer = Tokenizer()


def tokenize(text):
    """Return the surface string of each token janome yields for *text*."""
    surfaces = []
    for tok in tokenizer.tokenize(text):
        surfaces.append(tok.surface)
    return surfaces
# =====================
# Load dataset
# =====================
# Both CSV files are read from the working directory and must provide a
# "text" column and a "label" column.
train_df = pd.read_csv("japanese_sentiment_train.csv")
test_df = pd.read_csv("japanese_sentiment_test.csv")  # separate test set

# Pull each column out as a plain Python list.
train_texts, train_labels = (train_df[col].tolist() for col in ("text", "label"))
test_texts, test_labels = (test_df[col].tolist() for col in ("text", "label"))
# =====================
# Build vocabulary
# =====================
# Ids 0 and 1 are reserved for the padding and unknown-token markers.
vocab = {"<PAD>": 0, "<UNK>": 1}

# Only the training split feeds the vocabulary; any token that is not in it
# is looked up as <UNK> when text is encoded.
for text in train_texts:
    for token in tokenize(text):
        # setdefault assigns the next free id only the first time a token
        # is seen; repeated tokens keep their existing id.
        vocab.setdefault(token, len(vocab))

vocab_size = len(vocab)
print("Vocab size:", vocab_size)
# =====================
# Convert text to tensor
# =====================
def encode(text):
    """Turn *text* into a list of exactly MAX_LEN vocabulary ids.

    Tokens missing from ``vocab`` are replaced by the ``<UNK>`` id. Longer
    inputs are cut off after MAX_LEN tokens; shorter ones are right-padded
    with 0.
    """
    unk_id = vocab["<UNK>"]
    ids = [vocab.get(tok, unk_id) for tok in tokenize(text)]

    # Truncate first, then top up with padding; the padding is empty when
    # the sequence already has MAX_LEN entries.
    ids = ids[:MAX_LEN]
    ids.extend([0] * (MAX_LEN - len(ids)))
    return ids
# =====================
# Dataset class
# =====================
class JapaneseDataset(Dataset):
    """Dataset pairing raw texts with their labels.

    Texts are stored as given and encoded each time an item is requested.
    """

    def __init__(self, texts, labels):
        """Store the parallel sequences *texts* and *labels*."""
        self.texts = texts
        self.labels = labels

    def __len__(self):
        """Return the number of stored texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` tensors for the example at *idx*.

        The ids are a ``torch.long`` tensor built from ``encode``; the label
        is converted to a ``torch.float32`` tensor.
        """
        token_ids = encode(self.texts[idx])
        label = self.labels[idx]
        return (
            torch.tensor(token_ids, dtype=torch.long),
            torch.tensor(label, dtype=torch.float32),
        )
# =====================
# Train/test datasets and loaders
# =====================
# The two splits come from separate files, so each is wrapped directly.
train_dataset = JapaneseDataset(texts=train_texts, labels=train_labels)
test_dataset = JapaneseDataset(texts=test_texts, labels=test_labels)

# Training batches are shuffled; evaluation batches keep the stored order.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
)
# =====================
# Model
# =====================
class SentimentModel(nn.Module):
    """Averaged-embedding binary classifier.

    Token ids are embedded, averaged over the sequence dimension, and passed
    through a two-layer feed-forward head ending in a sigmoid, so the output
    is a probability in (0, 1) suitable for ``nn.BCELoss``.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_size: Embedding width. Defaults to the module-level
            ``EMBED_SIZE`` when omitted.
    """

    def __init__(self, vocab_size, embed_size=None):
        super().__init__()
        if embed_size is None:
            embed_size = EMBED_SIZE
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.fc = nn.Sequential(
            nn.Linear(embed_size, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Return one probability per example.

        Args:
            x: Long tensor of token ids with shape ``(batch, seq_len)``.

        Returns:
            Float tensor of shape ``(batch,)``.
        """
        x = self.embedding(x)
        # NOTE: the mean runs over every position, padding included.
        x = x.mean(dim=1)
        x = self.fc(x)
        # Drop only the trailing size-1 feature dimension. A bare squeeze()
        # would also remove the batch dimension when the batch holds a single
        # example, and the loss would then reject the shape mismatch.
        return x.squeeze(-1)
# Build the classifier and move its parameters to the selected device.
model = SentimentModel(vocab_size)
model = model.to(device)

# =====================
# Loss and optimizer
# =====================
# Binary cross-entropy on the model's sigmoid outputs, optimised with Adam.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=LR)
# =====================
# Training loop
# =====================
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0

    for batch_x, batch_y in train_loader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)

        # Clear gradients left over from the previous step before the new
        # forward/backward pass.
        optimizer.zero_grad()
        predictions = model(batch_x)
        loss = criterion(predictions, batch_y)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Reported value is the sum of per-batch losses for this epoch.
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")
# =====================
# Evaluation
# =====================
model.eval()
correct = 0
total = 0

# Gradients are not needed while scoring the test set.
with torch.no_grad():
    for batch_x, batch_y in test_loader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)

        probabilities = model(batch_x)
        # Probabilities strictly above 0.5 count as the positive class.
        predicted = (probabilities > 0.5).float()
        hits = predicted == batch_y

        correct += hits.sum().item()
        total += batch_y.size(0)

accuracy = correct / total
print("Accuracy:", accuracy)
# Save the trained weights together with the vocabulary, so the token-to-id
# mapping used in training can be restored alongside the model.
checkpoint = {
    "model_state_dict": model.state_dict(),
    "vocab": vocab,
}
torch.save(checkpoint, "japanese_sentiment_model.pth")
print("Model saved successfully.")