Spaces:

Obb12
/

Japanese_sentiment

Sleeping

App Files Files Community

Obb12 committed on Feb 22

Commit

c1cfbf2

verified ·

1 Parent(s): 6cfef48

Upload 8 files

Browse files

Files changed (8) hide show

app.py +15 -0
convert.py +15 -0
japanese_sentiment_model.pth +3 -0
japanese_sentiment_test.csv +0 -0
japanese_sentiment_train.csv +0 -0
predict.py +111 -0
requirements.txt +5 -0
train.py +194 -0

app.py ADDED Viewed

	@@ -0,0 +1,15 @@

+import gradio as gr
+from detect import predict_sentiment
+def classify_text(text):
+    return predict_sentiment(text)
+demo = gr.Interface(
+    fn=classify_text,
+    inputs=gr.Textbox(lines=2, placeholder="Enter Japanese text here..."),
+    outputs="text",
+    title="Japanese Sentiment Classifier",
+    description="Classifies Japanese text as Positive 😎 or Negative 😡 using a PyTorch model trained from scratch."
+)
+demo.launch()

convert.py ADDED Viewed

	@@ -0,0 +1,15 @@

+from datasets import load_dataset
+import pandas as pd
+# Load the dataset
+dataset = load_dataset("mteb/JapaneseSentimentClassification")
+# Convert each split to pandas DataFrame
+for split in ["train", "test"]:
+    df = pd.DataFrame(dataset[split])
+    # Optional: rename columns if you like
+    df = df.rename(columns={"text": "text", "label": "label"})
+    # Save to CSV
+    df.to_csv(f"japanese_sentiment_{split}.csv", index=False, encoding="utf-8-sig")
+    print(f"{split} split saved: japanese_sentiment_{split}.csv")

japanese_sentiment_model.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fba632690367b75cfa934f4207f187389a544a88a334fdfb3f1301b4d898c076
+size 4775636

japanese_sentiment_test.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

japanese_sentiment_train.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

predict.py ADDED Viewed

	@@ -0,0 +1,111 @@

+import torch
+import torch.nn as nn
+from janome.tokenizer import Tokenizer
+import argparse
+# =====================
+# Settings
+# =====================
+# MAX_LEN and EMBED_SIZE carry the same values as in train.py; the saved weights
+# only load into a model built with the EMBED_SIZE that was used for training.
+MAX_LEN = 20  # every input is padded or truncated to this many token ids
+EMBED_SIZE = 64  # width of each token embedding
+MODEL_PATH = "japanese_sentiment_model.pth"  # checkpoint file name written by train.py
+# Use the GPU when one is available, otherwise fall back to the CPU.
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# =====================
+# Tokenizer
+# =====================
+tokenizer = Tokenizer()
+def tokenize(text):
+    return [token.surface for token in tokenizer.tokenize(text)]
+# =====================
+# Model
+# =====================
+class SentimentModel(nn.Module):
+    def __init__(self, vocab_size):
+        super().__init__()
+        self.embedding = nn.Embedding(vocab_size, EMBED_SIZE)
+        self.fc = nn.Sequential(
+            nn.Linear(EMBED_SIZE, 32),
+            nn.ReLU(),
+            nn.Linear(32, 1),
+            nn.Sigmoid(),
+        )
+    def forward(self, x):
+        x = self.embedding(x)
+        x = x.mean(dim=1)
+        x = self.fc(x)
+        return x.squeeze()
+# =====================
+# Load model + vocab
+# =====================
+# Everything below runs at import time, so importing this module loads the model.
+# The checkpoint is indexed with the two keys train.py saves: "model_state_dict" and "vocab".
+checkpoint = torch.load(MODEL_PATH, map_location=device)
+vocab = checkpoint["vocab"]  # token -> id mapping used by encode()
+# Size the embedding table from the saved vocabulary so the weight shapes match.
+model = SentimentModel(len(vocab)).to(device)
+model.load_state_dict(checkpoint["model_state_dict"])
+model.eval()  # switch to evaluation mode for inference
+print("Model loaded successfully.")
+def encode(text):
+    tokens = tokenize(text)
+    ids = [vocab.get(token, vocab["<UNK>"]) for token in tokens]
+    if len(ids) < MAX_LEN:
+        ids += [vocab["<PAD>"]] * (MAX_LEN - len(ids))
+    else:
+        ids = ids[:MAX_LEN]
+    return ids
+def predict(text):
+    x = torch.tensor([encode(text)], dtype=torch.long).to(device)
+    with torch.no_grad():
+        output = model(x).item()
+    if output > 0.5:
+        print(f"Positive {output:.4f} | {text}")
+    else:
+        print(f"Negative {output:.4f} | {text}")
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Japanese sentiment prediction CLI using a saved PyTorch model."
+    )
+    parser.add_argument(
+        "text",
+        nargs="*",
+        help="Text to predict. If omitted, use --interactive.",
+    )
+    parser.add_argument(
+        "-i",
+        "--interactive",
+        action="store_true",
+        help="Interactive mode. Type text repeatedly (type 'exit' to quit).",
+    )
+    args = parser.parse_args()
+    if args.text:
+        predict(" ".join(args.text))
+    elif args.interactive:
+        while True:
+            text = input("text> ").strip()
+            if text.lower() in {"exit", "quit"}:
+                break
+            if text:
+                predict(text)
+    else:
+        parser.print_help()

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+torch
+pandas
+janome
+scikit-learn
+gradio

train.py ADDED Viewed

	@@ -0,0 +1,194 @@

+import torch
+import torch.nn as nn
+from torch.utils.data import Dataset, DataLoader
+import pandas as pd
+from janome.tokenizer import Tokenizer
+from sklearn.model_selection import train_test_split
+# =====================
+# Settings
+# =====================
+MAX_LEN = 20  # token ids per example after padding/truncation in encode()
+BATCH_SIZE = 32  # batch size for both DataLoaders
+EMBED_SIZE = 64  # width of each token embedding
+EPOCHS = 100  # full passes over the training loader
+# NOTE(review): 0.05 is well above Adam's default learning rate of 1e-3 - confirm this is intended.
+LR = 0.05
+# Use the GPU when one is available, otherwise fall back to the CPU.
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# =====================
+# Tokenizer
+# =====================
+tokenizer = Tokenizer()
+def tokenize(text):
+    return [token.surface for token in tokenizer.tokenize(text)]
+# =====================
+# Load dataset
+# =====================
+# Both CSVs are read from the working directory; the file names match what
+# convert.py writes. Each must provide "text" and "label" columns.
+train_df = pd.read_csv("japanese_sentiment_train.csv")
+test_df = pd.read_csv("japanese_sentiment_test.csv")  # separate test set
+# Plain Python lists are what the vocabulary builder and the Dataset consume.
+train_texts = train_df["text"].tolist()
+train_labels = train_df["label"].tolist()
+test_texts = test_df["text"].tolist()
+test_labels = test_df["label"].tolist()
+# =====================
+# Build vocabulary
+# =====================
+vocab = {"<PAD>": 0, "<UNK>": 1}
+for text in texts:
+    for token in tokenize(text):
+        if token not in vocab:
+            vocab[token] = len(vocab)
+vocab_size = len(vocab)
+print("Vocab size:", vocab_size)
+# =====================
+# Convert text to tensor
+# =====================
+def encode(text):
+    tokens = tokenize(text)
+    ids = [vocab.get(token, vocab["<UNK>"]) for token in tokens]
+    # padding
+    if len(ids) < MAX_LEN:
+        ids += [0] * (MAX_LEN - len(ids))
+    else:
+        ids = ids[:MAX_LEN]
+    return ids
+# =====================
+# Dataset class
+# =====================
+class JapaneseDataset(Dataset):
+    def __init__(self, texts, labels):
+        self.texts = texts
+        self.labels = labels
+    def __len__(self):
+        return len(self.texts)
+    def __getitem__(self, idx):
+        x = torch.tensor(encode(self.texts[idx]), dtype=torch.long)
+        y = torch.tensor(self.labels[idx], dtype=torch.float32)
+        return x, y
+# =====================
+# Datasets and loaders
+# =====================
+# No split is performed here: train and test come from the two separate CSV files.
+train_dataset = JapaneseDataset(train_texts, train_labels)
+test_dataset = JapaneseDataset(test_texts, test_labels)
+# Only the training loader shuffles; the test loader keeps file order.
+train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
+test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)
+# =====================
+# Model
+# =====================
+class SentimentModel(nn.Module):
+    def __init__(self, vocab_size):
+        super().__init__()
+        self.embedding = nn.Embedding(vocab_size, EMBED_SIZE)
+        self.fc = nn.Sequential(
+            nn.Linear(EMBED_SIZE, 32),
+            nn.ReLU(),
+            nn.Linear(32, 1),
+            nn.Sigmoid()
+        )
+    def forward(self, x):
+        x = self.embedding(x)
+        x = x.mean(dim=1)
+        x = self.fc(x)
+        return x.squeeze()
+# Build the model with an embedding row for every vocabulary id and move it to the device.
+model = SentimentModel(vocab_size).to(device)
+# =====================
+# Loss and optimizer
+# =====================
+# BCELoss takes probabilities as input; the model's final Sigmoid supplies them.
+criterion = nn.BCELoss()
+optimizer = torch.optim.Adam(model.parameters(), lr=LR)
+# =====================
+# Training loop
+# =====================
+for epoch in range(EPOCHS):
+    model.train()
+    total_loss = 0
+    for x, y in train_loader:
+        x, y = x.to(device), y.to(device)
+        outputs = model(x)
+        loss = criterion(outputs, y)
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+        total_loss += loss.item()
+    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")
+# =====================
+# Evaluation
+# =====================
+model.eval()
+correct = 0
+total = 0
+with torch.no_grad():
+    for x, y in test_loader:
+        x, y = x.to(device), y.to(device)
+        outputs = model(x)
+        predicted = (outputs > 0.5).float()
+        correct += (predicted == y).sum().item()
+        total += y.size(0)
+accuracy = correct / total
+print("Accuracy:", accuracy)
+# Save the weights together with the vocabulary: predict.py reads both keys and
+# sizes its embedding table from the saved vocab.
+torch.save({
+    "model_state_dict": model.state_dict(),
+    "vocab": vocab
+}, "japanese_sentiment_model.pth")
+print("Model saved successfully.")