Obb12 commited on
Commit
c1cfbf2
·
verified ·
1 Parent(s): 6cfef48

Upload 8 files

Browse files
app.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Gradio front-end for the Japanese sentiment classifier."""

import gradio as gr
from detect import predict_sentiment  # NOTE(review): `detect` module is not in this commit — confirm it defines predict_sentiment


def classify_text(text):
    """Forward *text* to the model wrapper and return its verdict string."""
    return predict_sentiment(text)


# Minimal single-textbox UI around the classifier.
demo = gr.Interface(
    fn=classify_text,
    inputs=gr.Textbox(lines=2, placeholder="Enter Japanese text here..."),
    outputs="text",
    title="Japanese Sentiment Classifier",
    description="Classifies Japanese text as Positive 😎 or Negative 😡 using a PyTorch model trained from scratch.",
)

demo.launch()
convert.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Export the mteb Japanese sentiment dataset splits to local CSV files."""

from datasets import load_dataset
import pandas as pd

# Load the dataset from the Hugging Face Hub.
dataset = load_dataset("mteb/JapaneseSentimentClassification")

# Convert each split to a DataFrame and save it as CSV.
for split in ["train", "test"]:
    df = pd.DataFrame(dataset[split])
    # FIX: removed `df.rename(columns={"text": "text", "label": "label"})` —
    # an identity mapping is a no-op and only added a pointless copy.

    # utf-8-sig adds a BOM so the Japanese text opens correctly in Excel.
    df.to_csv(f"japanese_sentiment_{split}.csv", index=False, encoding="utf-8-sig")
    print(f"{split} split saved: japanese_sentiment_{split}.csv")
japanese_sentiment_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fba632690367b75cfa934f4207f187389a544a88a334fdfb3f1301b4d898c076
3
+ size 4775636
japanese_sentiment_test.csv ADDED
The diff for this file is too large to render. See raw diff
 
japanese_sentiment_train.csv ADDED
The diff for this file is too large to render. See raw diff
 
predict.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from janome.tokenizer import Tokenizer
4
+ import argparse
5
+
6
# =====================
# Settings
# =====================

MAX_LEN = 20      # fixed token length every input is padded/truncated to
EMBED_SIZE = 64   # embedding dimension (must match the trained checkpoint)
MODEL_PATH = "japanese_sentiment_model.pth"

# Prefer GPU when present; inference also works on CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# =====================
# Tokenizer
# =====================

tokenizer = Tokenizer()


def tokenize(text):
    """Split *text* into Janome surface-form tokens."""
    return [tok.surface for tok in tokenizer.tokenize(text)]
25
+
26
+
27
# =====================
# Model
# =====================

class SentimentModel(nn.Module):
    """Bag-of-embeddings sentiment classifier.

    Token ids are embedded, mean-pooled over the sequence dimension, and
    fed through a small MLP ending in a sigmoid, so ``forward`` yields the
    probability of the positive class.
    """

    def __init__(self, vocab_size, embed_size=None):
        """Build the model.

        Args:
            vocab_size: number of rows in the embedding table.
            embed_size: embedding dimension; defaults to the module-level
                EMBED_SIZE so existing callers are unchanged.
        """
        super().__init__()
        if embed_size is None:
            embed_size = EMBED_SIZE
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.fc = nn.Sequential(
            nn.Linear(embed_size, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Return positive-class probability for a (batch, seq) id tensor."""
        x = self.embedding(x)   # (batch, seq, embed)
        x = x.mean(dim=1)       # mean-pool over tokens -> (batch, embed)
        x = self.fc(x)          # (batch, 1) sigmoid probability
        # NOTE: squeeze() also drops the batch dim when batch == 1.
        return x.squeeze()
47
+
48
+
49
# =====================
# Load model + vocab
# =====================

# The checkpoint bundles the trained weights AND the training-time vocabulary,
# so inference needs no artifact besides this .pth file.
# SECURITY NOTE: torch.load unpickles the file — only load trusted checkpoints.
checkpoint = torch.load(MODEL_PATH, map_location=device)
vocab = checkpoint["vocab"]
model = SentimentModel(len(vocab)).to(device)
model.load_state_dict(checkpoint["model_state_dict"])
model.eval()  # switch to inference mode (no dropout/batch-norm layers here, but good hygiene)

print("Model loaded successfully.")
60
+
61
+
62
def encode(text):
    """Encode *text* as a fixed-length (MAX_LEN) list of vocabulary ids.

    Out-of-vocabulary tokens map to <UNK>; the sequence is right-padded
    with <PAD> or truncated so exactly MAX_LEN ids are returned.
    """
    ids = [vocab.get(tok, vocab["<UNK>"]) for tok in tokenize(text)]
    padding = [vocab["<PAD>"]] * max(0, MAX_LEN - len(ids))
    return (ids + padding)[:MAX_LEN]
70
+
71
+
72
def predict(text):
    """Classify *text* and print the label, sigmoid score, and input."""
    batch = torch.tensor([encode(text)], dtype=torch.long).to(device)
    with torch.no_grad():
        score = model(batch).item()

    # Sigmoid output: above 0.5 counts as positive.
    if score > 0.5:
        print(f"Positive {score:.4f} | {text}")
    else:
        print(f"Negative {score:.4f} | {text}")
81
+
82
+
83
if __name__ == "__main__":
    # Command-line entry point: one-shot prediction or an interactive loop.
    parser = argparse.ArgumentParser(
        description="Japanese sentiment prediction CLI using a saved PyTorch model."
    )
    parser.add_argument(
        "text",
        nargs="*",
        help="Text to predict. If omitted, use --interactive.",
    )
    parser.add_argument(
        "-i",
        "--interactive",
        action="store_true",
        help="Interactive mode. Type text repeatedly (type 'exit' to quit).",
    )
    args = parser.parse_args()

    if args.text:
        # Positional words are re-joined into a single input string.
        predict(" ".join(args.text))
    elif args.interactive:
        while True:
            line = input("text> ").strip()
            if line.lower() in {"exit", "quit"}:
                break
            if line:
                predict(line)
    else:
        parser.print_help()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ torch
2
+ pandas
3
+ janome
4
+ scikit-learn
5
+ gradio
train.py ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from torch.utils.data import Dataset, DataLoader
4
+ import pandas as pd
5
+ from janome.tokenizer import Tokenizer
6
+ from sklearn.model_selection import train_test_split
7
+
8
# =====================
# Settings
# =====================

MAX_LEN = 20       # fixed token length every example is padded/truncated to
BATCH_SIZE = 32
EMBED_SIZE = 64    # embedding dimension (saved implicitly via the state dict)
EPOCHS = 100
LR = 0.05          # NOTE(review): unusually high for Adam (default 1e-3) — confirm intentional

# Train on GPU when available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# =====================
# Tokenizer
# =====================

tokenizer = Tokenizer()

def tokenize(text):
    """Split *text* into Janome surface-form tokens."""
    return [token.surface for token in tokenizer.tokenize(text)]
28
+
29
# =====================
# Load dataset
# =====================

# CSVs are produced by convert.py from the mteb JapaneseSentimentClassification
# dataset; each has "text" and "label" columns.
train_df = pd.read_csv("japanese_sentiment_train.csv")
test_df = pd.read_csv("japanese_sentiment_test.csv")  # separate test set

train_texts = train_df["text"].tolist()
train_labels = train_df["label"].tolist()

test_texts = test_df["text"].tolist()
test_labels = test_df["label"].tolist()
41
# =====================
# Build vocabulary
# =====================

# Special ids: 0 pads short sequences, 1 covers out-of-vocabulary tokens.
vocab = {"<PAD>": 0, "<UNK>": 1}

# BUG FIX: the original iterated over the undefined name `texts`
# (NameError at runtime); the vocabulary must be built from the
# training texts only, so the test set stays unseen.
for text in train_texts:
    for token in tokenize(text):
        if token not in vocab:
            vocab[token] = len(vocab)

vocab_size = len(vocab)

print("Vocab size:", vocab_size)
55
+
56
# =====================
# Convert text to tensor
# =====================

def encode(text):
    """Convert *text* into a fixed-length (MAX_LEN) list of vocab ids.

    Unknown tokens map to <UNK>.  Shorter sequences are right-padded;
    longer ones are truncated.
    """
    tokens = tokenize(text)
    ids = [vocab.get(token, vocab["<UNK>"]) for token in tokens]

    # CONSISTENCY FIX: pad with the <PAD> id instead of a hard-coded 0 so
    # this matches predict.py's encode and stays correct even if the
    # special-token ids ever change.  (<PAD> is 0 today, so behavior is
    # identical for current callers.)
    if len(ids) < MAX_LEN:
        ids += [vocab["<PAD>"]] * (MAX_LEN - len(ids))
    else:
        ids = ids[:MAX_LEN]

    return ids
71
+
72
# =====================
# Dataset class
# =====================

class JapaneseDataset(Dataset):
    """Wraps parallel text/label lists as an indexable torch Dataset."""

    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Encode lazily so only the sampled examples pay the tokenizer cost.
        features = torch.tensor(encode(self.texts[idx]), dtype=torch.long)
        target = torch.tensor(self.labels[idx], dtype=torch.float32)
        return features, target
88
+
89
# =====================
# Train/test split
# =====================

train_dataset = JapaneseDataset(train_texts, train_labels)
test_dataset = JapaneseDataset(test_texts, test_labels)

# Shuffle only the training data; evaluation order is irrelevant.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)
98
# =====================
# Model
# =====================

class SentimentModel(nn.Module):
    """Bag-of-embeddings sentiment classifier.

    Token ids are embedded, mean-pooled over the sequence dimension, and
    fed through a small MLP ending in a sigmoid, so ``forward`` yields the
    probability of the positive class.
    """

    def __init__(self, vocab_size, embed_size=None):
        """Build the model.

        Args:
            vocab_size: number of rows in the embedding table.
            embed_size: embedding dimension; defaults to the module-level
                EMBED_SIZE so existing callers are unchanged.
        """
        super().__init__()
        if embed_size is None:
            embed_size = EMBED_SIZE
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.fc = nn.Sequential(
            nn.Linear(embed_size, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Return positive-class probability for a (batch, seq) id tensor."""
        x = self.embedding(x)   # (batch, seq, embed)
        x = x.mean(dim=1)       # mean-pool over tokens -> (batch, embed)
        x = self.fc(x)          # (batch, 1) sigmoid probability
        # NOTE: squeeze() also drops the batch dim when batch == 1.
        return x.squeeze()
126
+
127
model = SentimentModel(vocab_size).to(device)

# =====================
# Loss and optimizer
# =====================

# BCELoss pairs with the Sigmoid output head (the model emits probabilities,
# not logits).
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
135
+
136
# =====================
# Training loop
# =====================

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0

    for x, y in train_loader:
        x, y = x.to(device), y.to(device)

        outputs = model(x)
        loss = criterion(outputs, y)

        # Standard step: reset grads, backprop, update weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # total_loss is summed over batches (not averaged).
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")
160
+
161
# =====================
# Evaluation
# =====================

model.eval()

correct = 0
total = 0

with torch.no_grad():
    for x, y in test_loader:
        x, y = x.to(device), y.to(device)

        outputs = model(x)
        # Threshold the sigmoid probability at 0.5 to get hard labels.
        predicted = (outputs > 0.5).float()

        correct += (predicted == y).sum().item()
        total += y.size(0)

accuracy = correct / total

print("Accuracy:", accuracy)
186
+
187
+
188
+
189
# Persist the weights together with the vocabulary so inference (predict.py)
# can rebuild the exact same model without needing the training CSVs.
torch.save({
    "model_state_dict": model.state_dict(),
    "vocab": vocab
}, "japanese_sentiment_model.pth")

print("Model saved successfully.")