harshjoshi2211 committed
Commit 78b915b · verified · 1 Parent(s): f74441b

Upload 6 files

Files changed (5)
  1. .gitattributes +5 -35
  2. app.py +249 -0
  3. label_encoder.pkl +0 -0
  4. policy_net.pkl +3 -0
  5. requirements.txt +7 -0
.gitattributes CHANGED
@@ -1,35 +1,5 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
+ train.csv filter=lfs diff=lfs merge=lfs -text
+ test.csv filter=lfs diff=lfs merge=lfs -text
+ *.csv filter=lfs diff=lfs merge=lfs -text
+ policy_net.pkl filter=lfs diff=lfs merge=lfs -text
+ policy_net.pth filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,249 @@
+ import torch
+ import torch.nn as nn
+ import torch.optim as optim
+ import pandas as pd
+ from collections import Counter
+ from sklearn.preprocessing import LabelEncoder
+ from torch.utils.data import Dataset, DataLoader
+ import pickle
+ import re
+ from nltk.corpus import stopwords
+ from nltk.stem import WordNetLemmatizer
+ import gradio as gr
+ import nltk
+
+ # Download NLTK resources
+ nltk.download("stopwords", quiet=True)
+ nltk.download("wordnet", quiet=True)
+
+ # Initialize stopwords and lemmatizer globally
+ stop_words = set(stopwords.words("english"))
+ lemmatizer = WordNetLemmatizer()
+
+ # Device configuration
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ # Dataset Class
+ class AmazonReviewDataset(Dataset):
+     def __init__(self, csv_file, max_length=50, sample_fraction=0.01, max_vocab_size=5000):
+         # Load dataset
+         print("Loading dataset from:", csv_file)
+         self.data = pd.read_csv(csv_file, header=None, names=["label", "title", "text"])
+         self.data = self.data.sample(frac=sample_fraction, random_state=42).reset_index(drop=True)
+         print(f"Using {len(self.data)} samples ({sample_fraction * 100:.2f}% of the dataset).")
+
+         # Clean text data
+         self.data["text"] = self.data["text"].apply(self.clean_text)
+
+         # Parameters
+         self.max_length = max_length
+         self.vocab = {"<PAD>": 0, "<UNK>": 1}
+         self.label_encoder = LabelEncoder()
+
+         # Build vocabulary
+         print("Building vocabulary...")
+         self._build_vocab(max_vocab_size)
+         print("Vocabulary built successfully.")
+
+         # Fit the label encoder
+         self.label_encoder.fit(self.data["label"])
+
+     def clean_text(self, text):
+         # Coerce NaN/non-string entries to str before applying regexes
+         text = str(text)
+         # Remove special characters and numbers
+         text = re.sub(r"[^a-zA-Z\s]", "", text)
+         # Convert to lowercase
+         text = text.lower()
+         # Remove stopwords
+         text = " ".join([word for word in text.split() if word not in stop_words])
+         # Apply lemmatization
+         text = " ".join([lemmatizer.lemmatize(word) for word in text.split()])
+         return text
+
+     def _build_vocab(self, max_vocab_size):
+         # Combine title and text columns
+         all_text = self.data["title"].astype(str) + " " + self.data["text"].astype(str)
+         all_text = all_text.fillna("")  # Ensure no NaN values
+         all_text = all_text[:50000]  # Use only the first 50,000 rows
+
+         # Tokenize and build vocabulary in smaller chunks
+         token_counts = Counter()
+         chunk_size = 5000  # Process smaller chunks
+         for i in range(0, len(all_text), chunk_size):
+             chunk = all_text[i:i + chunk_size]
+             tokens = " ".join(chunk).split()  # Tokenize the chunk
+             token_counts.update(tokens)
+             print(f"Processed {min(i + chunk_size, len(all_text))} rows...")
+
+         # Keep only the most common tokens
+         most_common_tokens = [token for token, _ in token_counts.most_common(max_vocab_size)]
+         for token in most_common_tokens:
+             self.vocab[token] = len(self.vocab)
+
+     def __len__(self):
+         return len(self.data)
+
+     def __getitem__(self, idx):
+         label = self.data.iloc[idx]["label"]
+         title = str(self.data.iloc[idx]["title"])
+         text = str(self.data.iloc[idx]["text"])
+         combined_text = title + " " + text  # Concatenate title and text
+         tokens = combined_text.split()[:self.max_length]  # Tokenize and truncate
+         token_ids = [self.vocab.get(token, self.vocab["<UNK>"]) for token in tokens]  # Convert tokens to IDs
+         padding = [self.vocab["<PAD>"]] * (self.max_length - len(token_ids))  # Add padding
+         token_ids += padding
+         label_encoded = self.label_encoder.transform([label])[0]  # Encode label
+         # Return CPU tensors so multi-worker DataLoaders can collate them;
+         # batches are moved to the device inside the training/evaluation loops.
+         return torch.tensor(token_ids, dtype=torch.long), torch.tensor(label_encoded, dtype=torch.long)
+
+
+ # Policy Network
+ class PolicyNetwork(nn.Module):
+     def __init__(self, vocab_size, embed_dim=32, hidden_dim=128, num_classes=2):
+         super(PolicyNetwork, self).__init__()
+         self.embedding = nn.Embedding(vocab_size, embed_dim)
+         self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
+         self.fc = nn.Linear(hidden_dim * 2, num_classes)  # Bidirectional LSTM doubles hidden size
+
+     def forward(self, x):
+         embedded = self.embedding(x)
+         lstm_out, _ = self.lstm(embedded)
+         out = self.fc(lstm_out[:, -1, :])  # Use the last time step's output
+         return out
+
+
+ # Training Function (single-step REINFORCE with an entropy bonus)
+ def train_rl_model(dataset, policy_net, optimizer, num_episodes=3, entropy_weight=0.01, batch_size=16):
+     dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=4)
+     for episode in range(num_episodes):
+         print(f"Episode {episode + 1} started.")
+         total_reward = 0
+         for batch in dataloader:
+             tokenized_reviews, true_labels = batch
+             tokenized_reviews, true_labels = tokenized_reviews.to(device), true_labels.to(device)
+             logits = policy_net(tokenized_reviews)
+             probs = torch.softmax(logits, dim=-1)
+             actions = torch.multinomial(probs, 1).squeeze(1)  # Sample one action per example
+
+             # Define rewards based on correctness
+             rewards = [1 if action == label else -1 for action, label in zip(actions, true_labels)]
+             rewards_tensor = torch.tensor(rewards, dtype=torch.float32).to(device)
+             rewards_tensor = (rewards_tensor - rewards_tensor.mean()) / (rewards_tensor.std() + 1e-8)  # Normalize rewards
+
+             # Compute loss
+             loss = 0
+             entropy_loss = 0
+             for i, action in enumerate(actions):
+                 log_prob = torch.log(probs[i, action] + 1e-8)
+                 loss += -log_prob * rewards_tensor[i]
+                 entropy_loss += -(probs[i] * torch.log(probs[i] + 1e-8)).sum()
+
+             loss += entropy_weight * entropy_loss
+
+             # Backpropagation
+             optimizer.zero_grad()
+             loss.backward()
+             torch.nn.utils.clip_grad_norm_(policy_net.parameters(), max_norm=1.0)
+             optimizer.step()
+
+             total_reward += sum(rewards)
+
+         print(f"Episode {episode + 1}, Total Reward: {total_reward}, Loss: {loss.item()}")
+
+     # Save the trained model (weights moved to CPU so the pickle loads on CPU-only hosts)
+     with open("policy_net.pkl", "wb") as f:
+         pickle.dump({k: v.cpu() for k, v in policy_net.state_dict().items()}, f)
+     print("Model saved successfully as policy_net.pkl")
+
+
+ # Evaluation Function
+ def evaluate_model(dataset, policy_net):
+     dataloader = DataLoader(dataset, batch_size=16, shuffle=False, num_workers=4)
+     correct = 0
+     total = 0
+     policy_net.eval()
+     with torch.no_grad():
+         for batch in dataloader:
+             tokenized_reviews, true_labels = batch
+             tokenized_reviews, true_labels = tokenized_reviews.to(device), true_labels.to(device)
+             logits = policy_net(tokenized_reviews)
+             probs = torch.softmax(logits, dim=-1)
+             predicted_classes = torch.argmax(probs, dim=-1)
+             correct += (predicted_classes == true_labels).sum().item()
+             total += true_labels.size(0)
+     accuracy = correct / total
+     print(f"Accuracy: {accuracy * 100:.2f}%")
+     return accuracy
+
+
+ # Prediction Function for Gradio
+ def predict_review(review_text):
+     with open("vocab.pkl", "rb") as f:
+         vocab = pickle.load(f)
+     with open("label_encoder.pkl", "rb") as f:
+         label_encoder = pickle.load(f)
+
+     tokenized_input = review_text.split()[:50]  # Limit to max length
+     token_ids = [vocab.get(word, vocab["<UNK>"]) for word in tokenized_input]
+     padding = [vocab["<PAD>"]] * (50 - len(token_ids))  # Pad if shorter than max length
+     token_ids += padding
+     token_ids = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0).to(device)
+
+     policy_net = PolicyNetwork(len(vocab), embed_dim=32, hidden_dim=128, num_classes=2).to(device)
+     with open("policy_net.pkl", "rb") as f:
+         policy_net.load_state_dict(pickle.load(f))
+     policy_net.eval()
+
+     with torch.no_grad():
+         logits = policy_net(token_ids)
+         probs = torch.softmax(logits, dim=-1)
+         predicted_class = torch.argmax(probs, dim=-1).item()
+         predicted_label = label_encoder.inverse_transform([predicted_class])[0]
+     return predicted_label
+
+
+ # Main Program
+ if __name__ == "__main__":
+     train_csv_path = r"D:\b\train.csv"
+     test_csv_path = r"D:\b\test.csv"
+     sample_fraction = 0.01
+     max_vocab_size = 5000
+     num_episodes = 3
+     batch_size = 16
+     lr = 0.001
+     entropy_weight = 0.01
+
+     # Initialize datasets
+     train_dataset = AmazonReviewDataset(train_csv_path, sample_fraction=sample_fraction, max_vocab_size=max_vocab_size)
+     test_dataset = AmazonReviewDataset(test_csv_path, sample_fraction=sample_fraction, max_vocab_size=max_vocab_size)
+     print("Dataset loaded successfully.")
+
+     # Initialize model and optimizer
+     policy_net = PolicyNetwork(len(train_dataset.vocab), embed_dim=32, hidden_dim=128, num_classes=2).to(device)
+     optimizer = optim.Adam(policy_net.parameters(), lr=lr)
+
+     # Train the model
+     train_rl_model(train_dataset, policy_net, optimizer, num_episodes=num_episodes, entropy_weight=entropy_weight, batch_size=batch_size)
+
+     # Evaluate the model
+     evaluate_model(test_dataset, policy_net)
+
+     # Save vocabulary and label encoder
+     with open("vocab.pkl", "wb") as f:
+         pickle.dump(train_dataset.vocab, f)
+     with open("label_encoder.pkl", "wb") as f:
+         pickle.dump(train_dataset.label_encoder, f)
+     print("Vocabulary and label encoder saved successfully.")
+
+     # Launch Gradio interface
+     iface = gr.Interface(
+         fn=predict_review,
+         inputs="text",
+         outputs="text",
+         title="Amazon Review Sentiment Analysis",
+         description="Enter a review to predict its sentiment (Positive/Negative).",
+     )
+
+     iface.launch(share=True)
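
The loop in train_rl_model is single-step REINFORCE with an entropy bonus: each prediction is a sampled action, the reward is +1 for a correct label and -1 otherwise, and the per-example loss is -log π(a|x) · R. The per-sample Python loop can be expressed equivalently in vectorized form; a minimal sketch, reusing the tensor names from the script (probs, actions, rewards_tensor, entropy_weight are assumed to exist as computed above):

import torch

# Vectorized equivalent of the per-sample loss loop in train_rl_model.
# probs: (batch, num_classes) softmax output; actions: (batch,) sampled class ids;
# rewards_tensor: (batch,) normalized rewards.
log_probs = torch.log(probs.gather(1, actions.unsqueeze(1)).squeeze(1) + 1e-8)
entropy = -(probs * torch.log(probs + 1e-8)).sum(dim=1)
loss = (-log_probs * rewards_tensor).sum() + entropy_weight * entropy.sum()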
label_encoder.pkl ADDED
Binary file (257 Bytes).
 
policy_net.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1cf5ca063f35b94a4c05a40388b319941d914276000fc385f7443ecf524d5095
+ size 1309513
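
Note that policy_net.pkl is a pickled state_dict (written by train_rl_model above), not a torch.save checkpoint, so it has to be loaded into a freshly constructed PolicyNetwork. A minimal loading sketch, assuming vocab.pkl from the same training run sits alongside it:

import pickle
from app import PolicyNetwork  # importing app runs its module-level NLTK downloads

with open("vocab.pkl", "rb") as f:
    vocab = pickle.load(f)  # the vocabulary size fixes the embedding layer's shape

# Rebuild the architecture defined in app.py, then load the pickled weights.
model = PolicyNetwork(len(vocab), embed_dim=32, hidden_dim=128, num_classes=2)
with open("policy_net.pkl", "rb") as f:
    model.load_state_dict(pickle.load(f))  # plain pickle, matching how it was saved
model.eval()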
requirements.txt ADDED
@@ -0,0 +1,7 @@
+
+ torch
+ pandas
+ scikit-learn
+ nltk
+ gradio
+ huggingface_hub
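
To reproduce the Space locally, the usual steps would be pip install -r requirements.txt followed by python app.py. Note that app.py expects the training and test CSVs at hard-coded Windows paths (D:\b\train.csv, D:\b\test.csv), so those paths would need to exist or be adjusted before the training run.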