|
|
import torch
|
|
|
import torch.nn as nn
|
|
|
import torch.optim as optim
|
|
|
import pandas as pd
|
|
|
from collections import Counter
|
|
|
from sklearn.preprocessing import LabelEncoder
|
|
|
from torch.utils.data import Dataset, DataLoader
|
|
|
import pickle
|
|
|
import re
|
|
|
from nltk.corpus import stopwords
|
|
|
from nltk.stem import WordNetLemmatizer
|
|
|
import gradio as gr
|
|
|
import os
|
|
|
import nltk
|
|
|
|
|
|
|
|
|
# Pick the compute device once; the rest of the module moves tensors and
# models onto it.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Make sure the NLTK corpora needed for text cleaning are present locally
# (quiet=True suppresses the download progress output).
nltk.download("stopwords", quiet=True)
nltk.download("wordnet", quiet=True)

# Shared text-cleaning resources: a set gives O(1) stop-word membership tests.
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words("english")}
|
|
|
|
|
|
|
|
|
class AmazonReviewDataset(Dataset):
    """Dataset over a header-less review CSV with columns (label, title, text).

    A random fraction of the rows is sampled (fixed seed 42), the ``text``
    column is cleaned with :meth:`clean_text`, and each item is returned as a
    fixed-length sequence of vocabulary ids plus an encoded label.

    Args:
        csv_file: Path of the CSV file to read.
        max_length: Number of token ids per item (truncated / padded).
        sample_fraction: Fraction of rows to keep.
        max_vocab_size: Maximum number of corpus tokens added to the
            vocabulary (in addition to ``<PAD>`` and ``<UNK>``).
        vocab: Optional existing token -> id mapping to reuse instead of
            building one from this file (e.g. the training vocabulary when
            loading a test split). Must contain ``<PAD>`` and ``<UNK>``.
        label_encoder: Optional already-fitted label encoder to reuse instead
            of fitting a new one on this file's labels.

    Raises:
        ValueError: If a supplied ``vocab`` lacks ``<PAD>`` or ``<UNK>``.
    """

    # Everything except ASCII letters and whitespace is stripped during cleaning.
    _NON_LETTERS = re.compile(r"[^a-zA-Z\s]")

    def __init__(self, csv_file, max_length=50, sample_fraction=0.01,
                 max_vocab_size=5000, vocab=None, label_encoder=None):
        print("Loading dataset from:", csv_file)
        self.data = pd.read_csv(csv_file, header=None, names=["label", "title", "text"])
        self.data = self.data.sample(frac=sample_fraction, random_state=42).reset_index(drop=True)
        print(f"Using {len(self.data)} samples ({sample_fraction * 100:.2f}% of the dataset).")

        # Missing cells are read as NaN; normalise them to empty strings so
        # they do not turn into a literal "nan" token later on.
        self.data["title"] = self.data["title"].fillna("").astype(str)
        self.data["text"] = self.data["text"].apply(self.clean_text)

        self.max_length = max_length

        if vocab is None:
            self.vocab = {"<PAD>": 0, "<UNK>": 1}
            print("Building vocabulary...")
            self._build_vocab(max_vocab_size)
            print("Vocabulary built successfully.")
        else:
            if "<PAD>" not in vocab or "<UNK>" not in vocab:
                raise ValueError("vocab must contain the '<PAD>' and '<UNK>' tokens")
            self.vocab = vocab

        if label_encoder is None:
            self.label_encoder = LabelEncoder()
            self.label_encoder.fit(self.data["label"])
        else:
            self.label_encoder = label_encoder

    def clean_text(self, text):
        """Return ``text`` lower-cased, letters only, without stop words, lemmatized.

        Non-string input is tolerated: missing values (None / NaN) become an
        empty string, anything else is converted with ``str``.
        """
        if not isinstance(text, str):
            text = "" if pd.isna(text) else str(text)

        text = self._NON_LETTERS.sub("", text).lower()
        kept = [word for word in text.split() if word not in stop_words]
        return " ".join(lemmatizer.lemmatize(word) for word in kept)

    def _build_vocab(self, max_vocab_size):
        """Add the most frequent corpus tokens to ``self.vocab``.

        Tokens are whitespace-split from "title + text" of at most the first
        50000 rows, counted in chunks of 5000 rows with a progress message
        after each chunk.
        """
        # fillna must run before astype(str); otherwise NaN has already been
        # turned into the string "nan" and is counted as a token.
        all_text = (
            self.data["title"].fillna("").astype(str)
            + " "
            + self.data["text"].fillna("").astype(str)
        )
        all_text = all_text.iloc[:50000]

        token_counts = Counter()
        chunk_size = 5000
        for start in range(0, len(all_text), chunk_size):
            chunk = all_text.iloc[start:start + chunk_size]
            token_counts.update(" ".join(chunk).split())
            print(f"Processed {min(start + chunk_size, len(all_text))} rows...")

        for token, _ in token_counts.most_common(max_vocab_size):
            # Skip tokens already present (notably a literal "<PAD>"/"<UNK>"
            # in the data): re-assigning them would move the special ids and
            # hand the same id to two different tokens.
            if token not in self.vocab:
                self.vocab[token] = len(self.vocab)

    def __len__(self):
        """Return the number of sampled rows."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` tensors for row ``idx``.

        ``token_ids`` has length ``max_length``; unknown tokens map to
        ``<UNK>`` and short sequences are right-padded with ``<PAD>``. Both
        tensors are int64 and are placed on the module-level ``device``.
        """
        row = self.data.iloc[idx]
        title = "" if pd.isna(row["title"]) else str(row["title"])
        text = "" if pd.isna(row["text"]) else str(row["text"])

        tokens = (title + " " + text).split()[:self.max_length]
        unk_id = self.vocab["<UNK>"]
        token_ids = [self.vocab.get(token, unk_id) for token in tokens]
        token_ids += [self.vocab["<PAD>"]] * (self.max_length - len(token_ids))

        label_encoded = self.label_encoder.transform([row["label"]])[0]

        # NOTE(review): tensors are moved to `device` here because existing
        # callers feed them straight into the model. Creating CUDA tensors
        # inside DataLoader worker processes is discouraged by PyTorch, so
        # use num_workers=0 when `device` is a GPU.
        return (
            torch.tensor(token_ids, dtype=torch.long).to(device),
            torch.tensor(label_encoded, dtype=torch.long).to(device),
        )
|
|
|
|
|
|
|
|
|
|
|
|
class PolicyNetwork(nn.Module):
    """Embedding -> bidirectional LSTM -> linear classifier over token ids.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Embedding dimension.
        hidden_dim: LSTM hidden size per direction.
        num_classes: Number of output logits.
    """

    def __init__(self, vocab_size, embed_dim=32, hidden_dim=128, num_classes=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``.

        ``x`` is a ``(batch, seq_len)`` tensor of token ids (the LSTM is
        constructed with ``batch_first=True``).
        """
        embedded = self.embedding(x)
        _, (h_n, _) = self.lstm(embedded)

        # Use the final hidden state of each direction. Slicing the output at
        # the last time step would give the backward direction a state that
        # has only seen the final token, discarding almost all of its context.
        # h_n[-2] is the forward direction, h_n[-1] the backward direction.
        summary = torch.cat((h_n[-2], h_n[-1]), dim=1)
        return self.fc(summary)
|
|
|
|
|
|
|
|
|
|
|
|
def train_rl_model(dataset, policy_net, optimizer, num_episodes=3, entropy_weight=0.01,
                   lr=0.001, batch_size=16, num_workers=4):
    """Train ``policy_net`` with a REINFORCE-style policy gradient.

    For every batch an action (class) is sampled from the policy; the reward
    is +1 when it equals the true label and -1 otherwise. Rewards are
    standardised within the batch and an entropy bonus is applied. After the
    last episode the model weights are pickled to ``policy_net.pkl``.

    Args:
        dataset: Dataset yielding ``(token_ids, label)`` pairs.
        policy_net: Model mapping a batch of token ids to class logits.
        optimizer: Optimizer already bound to ``policy_net``'s parameters.
        num_episodes: Number of passes over ``dataset``.
        entropy_weight: Coefficient of the entropy bonus.
        lr: Unused; the learning rate is whatever ``optimizer`` was built
            with. Kept so existing call sites continue to work.
        batch_size: Batch size of the data loader.
        num_workers: Data-loader worker processes. Forced to 0 when the model
            lives on a CUDA device.
    """
    model_device = next(policy_net.parameters()).device
    if model_device.type == "cuda":
        # The dataset may hand out tensors that already live on the GPU;
        # PyTorch advises against producing CUDA tensors in worker processes.
        num_workers = 0

    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)

    policy_net.train()
    for episode in range(num_episodes):
        print(f"Episode {episode + 1} started.")
        total_reward = 0
        last_loss = float("nan")  # reported as-is if the loader yields nothing

        for tokenized_reviews, true_labels in dataloader:
            # No-op when the dataset already placed the tensors on this device.
            tokenized_reviews = tokenized_reviews.to(model_device)
            true_labels = true_labels.to(model_device)

            logits = policy_net(tokenized_reviews)
            log_probs = torch.log_softmax(logits, dim=-1)
            probs = log_probs.exp()

            # squeeze(1), not squeeze(): a batch of one sample must stay 1-D.
            actions = torch.multinomial(probs, 1).squeeze(1)

            # +1 for a correct guess, -1 otherwise.
            rewards = (actions == true_labels).float() * 2.0 - 1.0

            # Standardise rewards within the batch. The standard deviation of
            # a single value is NaN, so a one-sample batch keeps its raw reward.
            if rewards.numel() > 1:
                advantages = (rewards - rewards.mean()) / (rewards.std() + 1e-8)
            else:
                advantages = rewards

            chosen_log_probs = log_probs.gather(1, actions.unsqueeze(1)).squeeze(1)
            policy_loss = -(chosen_log_probs * advantages).sum()

            # Entropy is a bonus: it is subtracted from the loss so that
            # minimising the loss keeps the policy exploratory.
            entropy = -(probs * log_probs).sum()
            loss = policy_loss - entropy_weight * entropy

            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(policy_net.parameters(), max_norm=1.0)
            optimizer.step()

            total_reward += int(rewards.sum().item())
            last_loss = loss.item()

        print(f"Episode {episode + 1}, Total Reward: {total_reward}, Loss: {last_loss}")

    # Store CPU copies so the file can be unpickled on a machine without a GPU.
    cpu_state = {name: tensor.detach().cpu() for name, tensor in policy_net.state_dict().items()}
    with open("policy_net.pkl", "wb") as f:
        pickle.dump(cpu_state, f)
    print("Model saved successfully as policy_net.pkl")
|
|
|
|
|
|
|
|
|
|
|
|
def evaluate_model(dataset, policy_net, num_workers=4):
    """Print and return the classification accuracy of ``policy_net``.

    The model is switched to eval mode and the highest-scoring class of each
    sample is compared with its label.

    Args:
        dataset: Dataset yielding ``(token_ids, label)`` pairs.
        policy_net: Model mapping a batch of token ids to class logits.
        num_workers: Data-loader worker processes. Forced to 0 when the model
            lives on a CUDA device.

    Returns:
        Fraction of correctly classified samples in ``[0, 1]``; ``0.0`` when
        ``dataset`` is empty.
    """
    model_device = next(policy_net.parameters()).device
    if model_device.type == "cuda":
        # The dataset may hand out tensors that already live on the GPU;
        # PyTorch advises against producing CUDA tensors in worker processes.
        num_workers = 0

    dataloader = DataLoader(dataset, batch_size=16, shuffle=False, num_workers=num_workers)

    correct = 0
    total = 0
    policy_net.eval()
    with torch.no_grad():
        for tokenized_reviews, true_labels in dataloader:
            # No-op when the dataset already placed the tensors on this device.
            tokenized_reviews = tokenized_reviews.to(model_device)
            true_labels = true_labels.to(model_device)

            # argmax over logits equals argmax over softmax probabilities.
            predicted_classes = policy_net(tokenized_reviews).argmax(dim=-1)
            correct += (predicted_classes == true_labels).sum().item()
            total += true_labels.size(0)

    # Avoid ZeroDivisionError on an empty dataset.
    accuracy = correct / total if total else 0.0
    print(f"Accuracy: {accuracy * 100:.2f}%")
    return accuracy
|
|
|
|
|
|
|
|
|
|
|
|
def predict_review(review_text):
    """Predict the label of a single review string.

    Loads the vocabulary (``vocab.pkl``), the label encoder
    (``label_encoder.pkl``) and the model weights (``policy_net.pkl``) from
    the current working directory on every call, encodes the review the same
    way the training text was encoded, and returns the decoded label of the
    highest-scoring class.

    Args:
        review_text: Raw review text; ``None`` is treated as an empty review.

    Returns:
        The predicted label as produced by the label encoder's
        ``inverse_transform``.
    """
    # Must match the sequence length the model was trained with (the default
    # ``max_length`` of AmazonReviewDataset).
    max_length = 50

    # SECURITY: pickle.load executes arbitrary code embedded in the file.
    # Only load artefacts written by this script, never untrusted files.
    with open("vocab.pkl", "rb") as f:
        vocab = pickle.load(f)
    with open("label_encoder.pkl", "rb") as f:
        label_encoder = pickle.load(f)

    # Apply the same cleaning as the training text (lower-casing, stop-word
    # removal, lemmatization); raw text would mostly map to <UNK>.
    # clean_text does not read instance state, hence the None for ``self``.
    cleaned = AmazonReviewDataset.clean_text(None, review_text or "")

    tokens = cleaned.split()[:max_length]
    unk_id = vocab["<UNK>"]
    token_ids = [vocab.get(word, unk_id) for word in tokens]
    token_ids += [vocab["<PAD>"]] * (max_length - len(token_ids))
    batch = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0).to(device)

    policy_net = PolicyNetwork(len(vocab), embed_dim=32, hidden_dim=128, num_classes=2).to(device)
    with open("policy_net.pkl", "rb") as f:
        policy_net.load_state_dict(pickle.load(f))
    policy_net.eval()

    with torch.no_grad():
        logits = policy_net(batch)
    predicted_class = torch.argmax(logits, dim=-1).item()

    return label_encoder.inverse_transform([predicted_class])[0]
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: train on the training CSV, evaluate on the test
    # CSV, persist the inference artefacts and serve a gradio demo.

    # Data locations default to the original hard-coded paths and can be
    # overridden through the TRAIN_CSV_PATH / TEST_CSV_PATH environment
    # variables.
    train_csv_path = os.environ.get("TRAIN_CSV_PATH", r"D:\b\train.csv")
    test_csv_path = os.environ.get("TEST_CSV_PATH", r"D:\b\test.csv")

    # Hyper-parameters.
    sample_fraction = 0.01
    max_vocab_size = 5000
    num_episodes = 3
    batch_size = 16
    lr = 0.001
    entropy_weight = 0.01

    train_dataset = AmazonReviewDataset(train_csv_path, sample_fraction=sample_fraction, max_vocab_size=max_vocab_size)
    test_dataset = AmazonReviewDataset(test_csv_path, sample_fraction=sample_fraction, max_vocab_size=max_vocab_size)

    # The model's embedding rows are indexed by the TRAINING vocabulary, so
    # the test split must encode tokens and labels with the same mappings
    # instead of the ones it derived from its own file.
    test_dataset.vocab = train_dataset.vocab
    test_dataset.label_encoder = train_dataset.label_encoder
    print("Dataset loaded successfully.")

    policy_net = PolicyNetwork(len(train_dataset.vocab), embed_dim=32, hidden_dim=128, num_classes=2).to(device)
    optimizer = optim.Adam(policy_net.parameters(), lr=lr)

    train_rl_model(
        train_dataset,
        policy_net,
        optimizer,
        num_episodes=num_episodes,
        entropy_weight=entropy_weight,
        lr=lr,
        batch_size=batch_size,
    )

    evaluate_model(test_dataset, policy_net)

    # predict_review reads these files on every request.
    with open("vocab.pkl", "wb") as f:
        pickle.dump(train_dataset.vocab, f)
    with open("label_encoder.pkl", "wb") as f:
        pickle.dump(train_dataset.label_encoder, f)
    print("Vocabulary and label encoder saved successfully.")

    iface = gr.Interface(
        fn=predict_review,
        inputs="text",
        outputs="text",
        title="Amazon Review Sentiment Analysis",
        description="Enter a review to predict its sentiment (Positive/Negative).",
    )

    # share=True is passed so gradio also creates a shareable link for the demo.
    iface.launch(share=True)