rishitha committed on
Commit
0c89f3d
·
verified ·
1 Parent(s): 694b473

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +246 -0
app.py ADDED
@@ -0,0 +1,246 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.optim as optim
4
+ import pandas as pd
5
+ from torch.utils.data import Dataset, DataLoader
6
+ from flask import Flask, request, jsonify
7
+ from sklearn.model_selection import train_test_split
8
+ import os
9
+
10
# Load data
# NOTE(review): the CSV is fetched over the network at import time, which
# couples app startup to Drive availability — TODO confirm this is intended.
url = "https://drive.google.com/uc?id=1RCZShB5ohy1HdU-mogcP16TbeVv9txpY"
df = pd.read_csv(url)
13
+
14
+ # Tokenizer
15
class ScratchTokenizer:
    """Whitespace word-level tokenizer built from scratch.

    Special token ids: 0=<PAD>, 1=<SOS>, 2=<EOS>, 3=<UNK>; real words
    are assigned ids starting at 4 by build_vocab().
    """

    def __init__(self):
        # BUG FIX: original had `_init_` (single underscores), so the
        # constructor never ran and no attributes were ever initialized.
        self.word2idx = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
        self.idx2word = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        self.vocab_size = 4

    def build_vocab(self, texts):
        """Extend the vocabulary with every previously unseen word in *texts*."""
        for text in texts:
            for word in text.split():
                if word not in self.word2idx:
                    self.word2idx[word] = self.vocab_size
                    self.idx2word[self.vocab_size] = word
                    self.vocab_size += 1

    def encode(self, text, max_len=200):
        """Encode *text* as [<SOS>] + word ids + [<EOS>], zero-padded to *max_len*.

        Unknown words map to <UNK> (3); the body is truncated to max_len - 2
        so the result is always exactly max_len long.
        """
        tokens = [self.word2idx.get(word, 3) for word in text.split()]
        tokens = [1] + tokens[:max_len - 2] + [2]
        return tokens + [0] * (max_len - len(tokens))

    def decode(self, tokens):
        """Join ids back into a string, dropping only <PAD> (id 0).

        Note: <SOS>/<EOS> markers are kept in the output (idx > 0 passes them).
        """
        return " ".join([self.idx2word.get(idx, "<UNK>") for idx in tokens if idx > 0])
36
+
37
# Train-Test Split (fixed seed so the split is reproducible across runs)
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# Initialize Tokenizer — vocabulary is built from the training split only
tokenizer = ScratchTokenizer()
tokenizer.build_vocab(train_data["instruction"].tolist() + train_data["response"].tolist())
43
+
44
# Dataset Class
class TextDataset(Dataset):
    """Yields (instruction, response) pairs encoded as fixed-length LongTensors.

    *data* is expected to be a DataFrame with "instruction" and "response"
    columns (matches the CSV loaded at module top).
    """

    def __init__(self, data, tokenizer, max_len=200):
        # BUG FIX: original used `_init_`/`_len_`/`_getitem_` (single
        # underscores), so none of the Dataset protocol methods existed.
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        src_text = self.data.iloc[idx]["instruction"]
        tgt_text = self.data.iloc[idx]["response"]
        # BUG FIX: self.max_len was stored but never forwarded to encode(),
        # so a non-default max_len was silently ignored.
        src = torch.tensor(self.tokenizer.encode(src_text, self.max_len), dtype=torch.long)
        tgt = torch.tensor(self.tokenizer.encode(tgt_text, self.max_len), dtype=torch.long)
        return src, tgt
60
+
61
# Model
class GPTModel(nn.Module):
    """Transformer text generator.

    Despite the name this is not a pure decoder-only GPT: forward() feeds the
    *embedded* source tokens to nn.TransformerDecoder as cross-attention
    memory, so it acts like a seq2seq decoder over unencoded source embeddings.
    """

    def __init__(self, vocab_size, embed_size=256, num_heads=8, num_layers=6, max_len=200):
        # BUG FIX: original had `_init_` and `super()._init_()` (single
        # underscores) — the module was never initialized.
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # Learned absolute positional embeddings, shared by src and tgt.
        self.pos_embedding = nn.Parameter(torch.randn(1, max_len, embed_size))
        self.transformer = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(d_model=embed_size, nhead=num_heads),
            num_layers=num_layers,
        )
        self.fc_out = nn.Linear(embed_size, vocab_size)

    def forward(self, src, tgt):
        """src/tgt: (batch, seq) LongTensors -> (batch, tgt_seq, vocab_size) logits."""
        src_emb = self.embedding(src) + self.pos_embedding[:, :src.size(1), :]
        tgt_emb = self.embedding(tgt) + self.pos_embedding[:, :tgt.size(1), :]
        # Causal mask: each target position may attend only to earlier positions.
        tgt_mask = nn.Transformer.generate_square_subsequent_mask(tgt.size(1)).to(tgt.device)
        # Decoder layers default to (seq, batch, dim) layout, hence the permutes.
        output = self.transformer(tgt_emb.permute(1, 0, 2), src_emb.permute(1, 0, 2), tgt_mask=tgt_mask)
        return self.fc_out(output.permute(1, 0, 2))
79
+
80
# Load model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GPTModel(tokenizer.vocab_size).to(device)

def load_model(model, path="gpt_model.pth"):
    """Load weights from *path* into *model* in-place and switch it to eval mode.

    Prints a message instead of raising when the checkpoint is missing, so
    the API can still start with random weights.
    """
    if os.path.exists(path):
        # NOTE(review): torch.load unpickles arbitrary objects — only load
        # trusted checkpoints (consider weights_only=True on newer torch).
        model.load_state_dict(torch.load(path, map_location=device))
        model.eval()
        print("Model loaded successfully.")
    else:
        print("Model file not found!")

load_model(model)
93
+
94
# Generate Response
def generate_response(model, query, max_length=200):
    """Greedy-decode a response for *query*.

    Starts from <SOS> (id 1) and appends the argmax token each step until
    <EOS> (id 2) or *max_length* tokens. Returns the decoded string
    (including the <SOS>/<EOS> markers, as decode() keeps them).
    """
    model.eval()
    src = torch.tensor(tokenizer.encode(query)).unsqueeze(0).to(device)
    tgt = torch.tensor([[1]]).to(device)  # seed with <SOS>
    # Inference only — no_grad skips autograd bookkeeping without changing output.
    with torch.no_grad():
        for _ in range(max_length):
            output = model(src, tgt)
            next_word = output.argmax(-1)[:, -1].unsqueeze(1)  # greedy pick at last position
            tgt = torch.cat([tgt, next_word], dim=1)
            if next_word.item() == 2:  # <EOS> terminates generation
                break
    return tokenizer.decode(tgt.squeeze(0).tolist())
106
+
107
# Flask App
# BUG FIX: original had Flask(_name_) — single underscores, a NameError.
app = Flask(__name__)

@app.route("/")
def home():
    """Health-check endpoint."""
    return {"message": "Transformer-based Response Generator API is running!"}

@app.route("/query", methods=["POST"])
def query_model():
    """POST {"query": "..."} -> {"query": ..., "response": ...} (400 on empty)."""
    # silent=True returns None instead of raising on a non-JSON body;
    # the `or {}` keeps the empty-query path working either way.
    data = request.get_json(silent=True) or {}
    query = data.get("query", "")
    if not query:
        return jsonify({"error": "Query cannot be empty"}), 400
    response = generate_response(model, query)
    return jsonify({"query": query, "response": response})

# DO NOT ADD app.run()
124
+ import torch
125
+ import torch.nn as nn
126
+ import torch.optim as optim
127
+ import pandas as pd
128
+ from torch.utils.data import Dataset, DataLoader
129
+ from flask import Flask, request, jsonify
130
+ from sklearn.model_selection import train_test_split
131
+ import os
132
+
133
# Load data
# NOTE(review): this whole script is duplicated verbatim earlier in the file —
# the duplicate should be removed once confirmed unintentional.
url = "https://drive.google.com/uc?id=1RCZShB5ohy1HdU-mogcP16TbeVv9txpY"
df = pd.read_csv(url)
136
+
137
+ # Tokenizer
138
class ScratchTokenizer:
    """Whitespace word-level tokenizer built from scratch.

    Special token ids: 0=<PAD>, 1=<SOS>, 2=<EOS>, 3=<UNK>; real words
    are assigned ids starting at 4 by build_vocab().
    """

    def __init__(self):
        # BUG FIX: original had `_init_` (single underscores), so the
        # constructor never ran and no attributes were ever initialized.
        self.word2idx = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
        self.idx2word = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        self.vocab_size = 4

    def build_vocab(self, texts):
        """Extend the vocabulary with every previously unseen word in *texts*."""
        for text in texts:
            for word in text.split():
                if word not in self.word2idx:
                    self.word2idx[word] = self.vocab_size
                    self.idx2word[self.vocab_size] = word
                    self.vocab_size += 1

    def encode(self, text, max_len=200):
        """Encode *text* as [<SOS>] + word ids + [<EOS>], zero-padded to *max_len*."""
        tokens = [self.word2idx.get(word, 3) for word in text.split()]
        tokens = [1] + tokens[:max_len - 2] + [2]
        return tokens + [0] * (max_len - len(tokens))

    def decode(self, tokens):
        """Join ids back into a string, dropping only <PAD> (id 0)."""
        return " ".join([self.idx2word.get(idx, "<UNK>") for idx in tokens if idx > 0])
159
+
160
# Train-Test Split (fixed seed so the split is reproducible across runs)
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# Initialize Tokenizer — vocabulary is built from the training split only
tokenizer = ScratchTokenizer()
tokenizer.build_vocab(train_data["instruction"].tolist() + train_data["response"].tolist())
166
+
167
# Dataset Class
class TextDataset(Dataset):
    """Yields (instruction, response) pairs encoded as fixed-length LongTensors."""

    def __init__(self, data, tokenizer, max_len=200):
        # BUG FIX: original used `_init_`/`_len_`/`_getitem_` (single
        # underscores), so none of the Dataset protocol methods existed.
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        src_text = self.data.iloc[idx]["instruction"]
        tgt_text = self.data.iloc[idx]["response"]
        # BUG FIX: self.max_len was stored but never forwarded to encode().
        src = torch.tensor(self.tokenizer.encode(src_text, self.max_len), dtype=torch.long)
        tgt = torch.tensor(self.tokenizer.encode(tgt_text, self.max_len), dtype=torch.long)
        return src, tgt
183
+
184
# Model
class GPTModel(nn.Module):
    """Transformer text generator.

    Despite the name this is not a pure decoder-only GPT: forward() feeds the
    *embedded* source tokens to nn.TransformerDecoder as cross-attention memory.
    """

    def __init__(self, vocab_size, embed_size=256, num_heads=8, num_layers=6, max_len=200):
        # BUG FIX: original had `_init_` and `super()._init_()` (single
        # underscores) — the module was never initialized.
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # Learned absolute positional embeddings, shared by src and tgt.
        self.pos_embedding = nn.Parameter(torch.randn(1, max_len, embed_size))
        self.transformer = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(d_model=embed_size, nhead=num_heads),
            num_layers=num_layers,
        )
        self.fc_out = nn.Linear(embed_size, vocab_size)

    def forward(self, src, tgt):
        """src/tgt: (batch, seq) LongTensors -> (batch, tgt_seq, vocab_size) logits."""
        src_emb = self.embedding(src) + self.pos_embedding[:, :src.size(1), :]
        tgt_emb = self.embedding(tgt) + self.pos_embedding[:, :tgt.size(1), :]
        # Causal mask: each target position may attend only to earlier positions.
        tgt_mask = nn.Transformer.generate_square_subsequent_mask(tgt.size(1)).to(tgt.device)
        # Decoder layers default to (seq, batch, dim) layout, hence the permutes.
        output = self.transformer(tgt_emb.permute(1, 0, 2), src_emb.permute(1, 0, 2), tgt_mask=tgt_mask)
        return self.fc_out(output.permute(1, 0, 2))
202
+
203
# Load model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GPTModel(tokenizer.vocab_size).to(device)

def load_model(model, path="gpt_model.pth"):
    """Load weights from *path* into *model* in-place and switch it to eval mode.

    Prints a message instead of raising when the checkpoint is missing, so
    the API can still start with random weights.
    """
    if os.path.exists(path):
        # NOTE(review): torch.load unpickles arbitrary objects — only load
        # trusted checkpoints (consider weights_only=True on newer torch).
        model.load_state_dict(torch.load(path, map_location=device))
        model.eval()
        print("Model loaded successfully.")
    else:
        print("Model file not found!")

load_model(model)
216
+
217
+ # Generate Response
218
+ def generate_response(model, query, max_length=200):
219
+ model.eval()
220
+ src = torch.tensor(tokenizer.encode(query)).unsqueeze(0).to(device)
221
+ tgt = torch.tensor([[1]]).to(device) # <SOS>
222
+ for _ in range(max_length):
223
+ output = model(src, tgt)
224
+ next_word = output.argmax(-1)[:, -1].unsqueeze(1)
225
+ tgt = torch.cat([tgt, next_word], dim=1)
226
+ if next_word.item() == 2: # <EOS>
227
+ break
228
+ return tokenizer.decode(tgt.squeeze(0).tolist())
229
+
230
# Flask App
# BUG FIX: original had Flask(_name_) — single underscores, a NameError.
app = Flask(__name__)

@app.route("/")
def home():
    """Health-check endpoint."""
    return {"message": "Transformer-based Response Generator API is running!"}

@app.route("/query", methods=["POST"])
def query_model():
    """POST {"query": "..."} -> {"query": ..., "response": ...} (400 on empty)."""
    # silent=True returns None instead of raising on a non-JSON body;
    # the `or {}` keeps the empty-query path working either way.
    data = request.get_json(silent=True) or {}
    query = data.get("query", "")
    if not query:
        return jsonify({"error": "Query cannot be empty"}), 400
    response = generate_response(model, query)
    return jsonify({"query": query, "response": response})

# DO NOT ADD app.run()