harshjoshi2211 committed (verified)
Commit 1cc7312
1 Parent(s): e3d641e

Upload 8 files

Files changed (8)
  1. .gitattributes +5 -35
  2. app.py +242 -0
  3. label_encoder.pkl +0 -0
  4. model1.pth +3 -0
  5. policy_net.pkl +3 -0
  6. readme.txt +16 -0
  7. requirements.txt +7 -0
  8. vocab.pkl +0 -0
.gitattributes CHANGED
@@ -1,35 +1,5 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
+ train.csv filter=lfs diff=lfs merge=lfs -text
+ test.csv filter=lfs diff=lfs merge=lfs -text
+ *.csv filter=lfs diff=lfs merge=lfs -text
+ model1.pth filter=lfs diff=lfs merge=lfs -text
+ policy_net.pkl filter=lfs diff=lfs merge=lfs -text
 
app.py ADDED
@@ -0,0 +1,242 @@
+ import torch
+ import torch.nn as nn
+ import torch.optim as optim
+ import pandas as pd
+ from collections import Counter
+ from sklearn.preprocessing import LabelEncoder
+ from torch.utils.data import Dataset, DataLoader
+ import pickle
+ import re
+ from nltk.corpus import stopwords
+ from nltk.stem import WordNetLemmatizer
+ import gradio as gr
+ import os
+ import nltk
+
+ # Download NLTK resources
+ nltk.download("stopwords", quiet=True)
+ nltk.download("wordnet", quiet=True)
+
+ # Initialize stopwords and lemmatizer globally
+ stop_words = set(stopwords.words("english"))
+ lemmatizer = WordNetLemmatizer()
+
+ # Device configuration
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ # Dataset Class
+ class AmazonReviewDataset(Dataset):
+     def __init__(self, csv_file, max_length=50, sample_fraction=0.01, max_vocab_size=5000):
+         # Load dataset
+         print("Loading dataset from:", csv_file)
+         self.data = pd.read_csv(csv_file, header=None, names=["label", "title", "text"])
+         self.data = self.data.sample(frac=sample_fraction, random_state=42).reset_index(drop=True)
+         print(f"Using {len(self.data)} samples ({sample_fraction * 100:.2f}% of the dataset).")
+
+         # Clean text data
+         self.data["text"] = self.data["text"].apply(self.clean_text)
+
+         # Parameters
+         self.max_length = max_length
+         self.vocab = {"<PAD>": 0, "<UNK>": 1}
+         self.label_encoder = LabelEncoder()
+
+         # Build vocabulary
+         print("Building vocabulary...")
+         self._build_vocab(max_vocab_size)
+         print("Vocabulary built successfully.")
+
+         # Fit the label encoder
+         self.label_encoder.fit(self.data["label"])
+
+     def clean_text(self, text):
+         # Remove special characters and numbers
+         text = re.sub(r"[^a-zA-Z\s]", "", text)
+         # Convert to lowercase
+         text = text.lower()
+         # Remove stopwords
+         text = " ".join([word for word in text.split() if word not in stop_words])
+         # Apply lemmatization
+         text = " ".join([lemmatizer.lemmatize(word) for word in text.split()])
+         return text
+
+     def _build_vocab(self, max_vocab_size):
+         # Combine title and text columns
+         all_text = self.data["title"].astype(str) + " " + self.data["text"].astype(str)
+         all_text = all_text.fillna("")  # Ensure no NaN values
+         all_text = all_text[:50000]  # Use only the first 50,000 rows
+
+         # Tokenize and build vocabulary in smaller chunks
+         token_counts = Counter()
+         chunk_size = 5000  # Process smaller chunks
+         for i in range(0, len(all_text), chunk_size):
+             chunk = all_text[i:i + chunk_size]
+             tokens = " ".join(chunk).split()  # Tokenize the chunk
+             token_counts.update(tokens)
+             print(f"Processed {min(i + chunk_size, len(all_text))} rows...")
+
+         # Keep only the most common tokens
+         most_common_tokens = [token for token, _ in token_counts.most_common(max_vocab_size)]
+         for token in most_common_tokens:
+             self.vocab[token] = len(self.vocab)
+
+     def __len__(self):
+         return len(self.data)
+
+     def __getitem__(self, idx):
+         label = self.data.iloc[idx]["label"]
+         title = str(self.data.iloc[idx]["title"])
+         text = str(self.data.iloc[idx]["text"])
+         combined_text = title + " " + text  # Concatenate title and text
+         tokens = combined_text.split()[:self.max_length]  # Tokenize and truncate
+         token_ids = [self.vocab.get(token, self.vocab["<UNK>"]) for token in tokens]  # Convert tokens to IDs
+         padding = [self.vocab["<PAD>"]] * (self.max_length - len(token_ids))  # Add padding
+         token_ids += padding
+         label_encoded = self.label_encoder.transform([label])[0]  # Encode label
+         return torch.tensor(token_ids, dtype=torch.long).to(device), torch.tensor(label_encoded, dtype=torch.long).to(device)
+
+
+ # Policy Network
+ class PolicyNetwork(nn.Module):
+     def __init__(self, vocab_size, embed_dim=32, hidden_dim=128, num_classes=2):
+         super(PolicyNetwork, self).__init__()
+         self.embedding = nn.Embedding(vocab_size, embed_dim)
+         self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
+         self.fc = nn.Linear(hidden_dim * 2, num_classes)  # Bidirectional LSTM doubles hidden size
+
+     def forward(self, x):
+         embedded = self.embedding(x)
+         lstm_out, _ = self.lstm(embedded)
+         out = self.fc(lstm_out[:, -1, :])  # Use the last hidden state
+         return out
+
+
+ # Training Function
+ def train_rl_model(dataset, policy_net, optimizer, num_episodes=3, entropy_weight=0.01, lr=0.001, batch_size=16):
+     dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=0)  # num_workers=0: __getitem__ already moves tensors to `device`, which fails in forked CUDA worker processes
+     for episode in range(num_episodes):
+         print(f"Episode {episode + 1} started.")
+         total_reward = 0
+         for batch in dataloader:
+             tokenized_reviews, true_labels = batch
+             logits = policy_net(tokenized_reviews)
+             probs = torch.softmax(logits, dim=-1)
+             actions = torch.multinomial(probs, 1).squeeze(1)  # squeeze(1) keeps a 1-D tensor even when the last batch has a single sample
+
+             # Define rewards based on correctness
+             rewards = [1 if action == label else -1 for action, label in zip(actions, true_labels)]
+             rewards_tensor = torch.tensor(rewards, dtype=torch.float32).to(device)
+             rewards_tensor = (rewards_tensor - rewards_tensor.mean()) / (rewards_tensor.std() + 1e-8)  # Normalize rewards
+
+             # Compute loss
+             loss = 0
+             entropy_loss = 0
+             for i, action in enumerate(actions):
+                 log_prob = torch.log(probs[i, action] + 1e-8)
+                 loss += -log_prob * rewards_tensor[i]
+                 entropy_loss += -(probs[i] * torch.log(probs[i] + 1e-8)).sum()
+
+             loss += entropy_weight * entropy_loss
+
+             # Backpropagation
+             optimizer.zero_grad()
+             loss.backward()
+             torch.nn.utils.clip_grad_norm_(policy_net.parameters(), max_norm=1.0)
+             optimizer.step()
+
+             total_reward += sum(rewards)
+
+         print(f"Episode {episode + 1}, Total Reward: {total_reward}, Loss: {loss.item()}")
+
+     # Save the trained model as model1.pth
+     torch.save(policy_net.state_dict(), "model1.pth")
+     print("Model saved successfully as model1.pth")
+
+
+ # Evaluation Function
+ def evaluate_model(dataset, policy_net):
+     dataloader = DataLoader(dataset, batch_size=16, shuffle=False, num_workers=0)  # num_workers=0 for the same reason as in training
+     correct = 0
+     total = 0
+     policy_net.eval()
+     with torch.no_grad():
+         for batch in dataloader:
+             tokenized_reviews, true_labels = batch
+             logits = policy_net(tokenized_reviews)
+             probs = torch.softmax(logits, dim=-1)
+             predicted_classes = torch.argmax(probs, dim=-1)
+             correct += (predicted_classes == true_labels).sum().item()
+             total += true_labels.size(0)
+     accuracy = correct / total
+     print(f"Accuracy: {accuracy * 100:.2f}%")
+     return accuracy
+
+
+ # Prediction Function for Gradio
+ def predict_review(review_text):
+     with open("vocab.pkl", "rb") as f:
+         vocab = pickle.load(f)
+     with open("label_encoder.pkl", "rb") as f:
+         label_encoder = pickle.load(f)
+
+     tokenized_input = review_text.split()[:50]  # Limit to max length
+     token_ids = [vocab.get(word, vocab["<UNK>"]) for word in tokenized_input]
+     padding = [vocab["<PAD>"]] * (50 - len(token_ids))  # Pad if shorter than max length
+     token_ids += padding
+     token_ids = torch.tensor(token_ids).unsqueeze(0).to(device)
+
+     policy_net = PolicyNetwork(len(vocab), embed_dim=32, hidden_dim=128, num_classes=2).to(device)
+     policy_net.load_state_dict(torch.load("model1.pth", map_location=device))  # map_location lets a GPU-trained checkpoint load on CPU-only hosts
+     policy_net.eval()
+
+     with torch.no_grad():
+         logits = policy_net(token_ids)
+         probs = torch.softmax(logits, dim=-1)
+         predicted_class = torch.argmax(probs, dim=-1).item()
+     predicted_label = label_encoder.inverse_transform([predicted_class])[0]
+     return predicted_label
+
+
+ # Main Program
+ if __name__ == "__main__":
+     train_csv_path = r"D:\b\train.csv"
+     test_csv_path = r"D:\b\test.csv"
+     sample_fraction = 0.01
+     max_vocab_size = 5000
+     num_episodes = 3
+     batch_size = 16
+     lr = 0.001
+     entropy_weight = 0.01
+
+     # Initialize datasets
+     train_dataset = AmazonReviewDataset(train_csv_path, sample_fraction=sample_fraction, max_vocab_size=max_vocab_size)
+     test_dataset = AmazonReviewDataset(test_csv_path, sample_fraction=sample_fraction, max_vocab_size=max_vocab_size)
+     print("Dataset loaded successfully.")
+
+     # Initialize model and optimizer
+     policy_net = PolicyNetwork(len(train_dataset.vocab), embed_dim=32, hidden_dim=128, num_classes=2).to(device)
+     optimizer = optim.Adam(policy_net.parameters(), lr=lr)
+
+     # Train the model
+     train_rl_model(train_dataset, policy_net, optimizer, num_episodes=num_episodes, entropy_weight=entropy_weight, lr=lr, batch_size=batch_size)
+
+     # Evaluate the model
+     evaluate_model(test_dataset, policy_net)
+
+     # Save vocabulary and label encoder
+     with open("vocab.pkl", "wb") as f:
+         pickle.dump(train_dataset.vocab, f)
+     with open("label_encoder.pkl", "wb") as f:
+         pickle.dump(train_dataset.label_encoder, f)
+     print("Vocabulary and label encoder saved successfully.")
+
+     # Launch Gradio interface
+     iface = gr.Interface(
+         fn=predict_review,
+         inputs="text",
+         outputs="text",
+         title="Amazon Review Sentiment Analysis",
+         description="Enter a review to predict its sentiment (Positive/Negative)."
+     )
+
+     iface.launch(share=True)
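
For reference, the per-sample Python loop inside train_rl_model computes a plain REINFORCE objective plus an entropy term. A minimal vectorized sketch of the same computation (a sketch only, assuming the probs, actions and normalized rewards_tensor produced above; the helper name reinforce_loss is illustrative, not part of the committed file):

import torch

def reinforce_loss(probs, actions, rewards, entropy_weight=0.01):
    """Same quantity as the per-sample loop in train_rl_model, computed in one pass."""
    chosen = probs.gather(1, actions.unsqueeze(1)).squeeze(1)      # probability assigned to each sampled action
    policy_loss = (-torch.log(chosen + 1e-8) * rewards).sum()      # REINFORCE term, summed over the batch
    entropy = -(probs * torch.log(probs + 1e-8)).sum(dim=1).sum()  # summed entropy of the predicted distributions
    return policy_loss + entropy_weight * entropy

As in the original loop, the entropy term is added with a positive sign, so it penalizes rather than encourages high-entropy predictions; conventional entropy regularization would subtract it.
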
label_encoder.pkl ADDED
Binary file (257 Bytes).
 
model1.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0ad43e1a269484e1f312b178fcb30b3ca34e908629d01d059563be286e3066ef
+ size 1309158
policy_net.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1cf5ca063f35b94a4c05a40388b319941d914276000fc385f7443ecf524d5095
+ size 1309513
readme.txt ADDED
@@ -0,0 +1,16 @@
+ Amazon Review Polarity Dataset
+
+ Version 3, Updated 09/09/2015
+
+ ORIGIN
+
+ The Amazon reviews dataset consists of reviews from Amazon. The data span a period of 18 years, including ~35 million reviews up to March 2013. Reviews include product and user information, ratings, and a plaintext review. For more information, please refer to the following paper: J. McAuley and J. Leskovec. Hidden factors and hidden topics: understanding rating dimensions with review text. RecSys, 2013.
+
+ The Amazon reviews polarity dataset was constructed by Xiang Zhang (xiang.zhang@nyu.edu) from the above dataset. It is used as a text classification benchmark in the following paper: Xiang Zhang, Junbo Zhao, Yann LeCun. Character-level Convolutional Networks for Text Classification. Advances in Neural Information Processing Systems 28 (NIPS 2015).
+
+
+ DESCRIPTION
+
+ The Amazon reviews polarity dataset is constructed by taking reviews with score 1 and 2 as negative, and 4 and 5 as positive. Samples with score 3 are ignored. In the dataset, class 1 is negative and class 2 is positive. Each class has 1,800,000 training samples and 200,000 testing samples.
+
+ The files train.csv and test.csv contain all the training and testing samples as comma-separated values. There are 3 columns in them, corresponding to class index (1 or 2), review title and review text. The review title and text are escaped using double quotes ("), and any internal double quote is escaped by 2 double quotes (""). New lines are escaped by a backslash followed by an "n" character, that is "\n".
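
Since the CSV files carry no header row, the three columns described above have to be named by the reader. A minimal loading sketch with pandas (the column names are illustrative labels, they are not stored in the files, and train.csv is assumed to sit in the working directory):

import pandas as pd

# columns: class index (1 = negative, 2 = positive), review title, review text
df = pd.read_csv("train.csv", header=None, names=["label", "title", "text"])

# the literal "\n" escape sequences can be turned back into real newlines if needed
df["text"] = df["text"].str.replace(r"\n", "\n", regex=False)

print(df["label"].value_counts())

This mirrors how app.py reads the same files (header=None with the label/title/text column names).
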
requirements.txt ADDED
@@ -0,0 +1,7 @@
+
+ torch
+ pandas
+ scikit-learn
+ nltk
+ gradio
+ huggingface_hub
vocab.pkl ADDED
Binary file (60.6 kB).