vinay0123 committed on
Commit
89c24a5
·
verified ·
1 Parent(s): 8cbebad

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +325 -0
app.py ADDED
@@ -0,0 +1,325 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import textwrap
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.optim as optim
6
+ import spacy
7
+ import random
8
+ import pandas as pd
9
+ from torch.utils.data import Dataset, DataLoader
10
+ from torch.nn.utils.rnn import pad_sequence
11
+ from sklearn.model_selection import train_test_split
12
+ from flask import Flask ,request, jsonify,send_file,after_this_request
13
+ from collections import Counter
14
+ from flask_cors import CORS
15
+ import requests
16
+ from gtts import gTTS
17
+ from googletrans import Translator
18
+ import uuid
19
+ import os
20
# Download the instruction/response corpus and drop rows missing either field.
df = pd.read_csv(
    "https://drive.google.com/uc?id=1RCZShB5ohy1HdU-mogcP16TbeVv9txpY"
).dropna(subset=['instruction', 'response'])

# Coerce both text columns to str so the whitespace tokenizer can call
# .split() on every entry.
df['instruction'] = df['instruction'].astype(str)
df['response'] = df['response'].astype(str)
27
+ # Tokenizer (Scratch)
28
class ScratchTokenizer:
    """Whitespace tokenizer with a vocabulary grown from raw text.

    Ids 0-3 are reserved for <PAD>, <SOS>, <EOS> and <UNK>; every other word
    receives the next free id the first time ``build_vocab`` sees it.
    """

    def __init__(self):
        specials = ["<PAD>", "<SOS>", "<EOS>", "<UNK>"]
        self.word2idx = {token: i for i, token in enumerate(specials)}
        self.idx2word = dict(enumerate(specials))
        self.vocab_size = len(specials)

    def build_vocab(self, texts):
        """Register every unseen whitespace-separated word found in *texts*."""
        for sentence in texts:
            for token in sentence.split():
                if token in self.word2idx:
                    continue
                new_id = self.vocab_size
                self.word2idx[token] = new_id
                self.idx2word[new_id] = token
                self.vocab_size = new_id + 1

    def encode(self, text, max_len=200):
        """Return ``<SOS> ids... <EOS>`` followed by <PAD> ids up to *max_len*.

        At most ``max_len - 2`` word ids are kept; unknown words map to <UNK>.
        """
        unk_id = 3
        word_ids = [self.word2idx.get(w, unk_id) for w in text.split()]
        framed = [1, *word_ids[:max_len - 2], 2]
        padding = [0] * (max_len - len(framed))
        return framed + padding

    def decode(self, tokens):
        """Join the words for *tokens*, skipping <PAD> (id 0).

        <SOS>/<EOS> markers are kept in the output; ids missing from the
        vocabulary are rendered as "<UNK>".
        """
        words = []
        for idx in tokens:
            if idx > 0:
                words.append(self.idx2word.get(idx, "<UNK>"))
        return " ".join(words)
49
+
50
# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
train_data, test_data = train_test_split(df, random_state=42, test_size=0.2)

# The vocabulary is built from the training split only, over both columns
# (instructions first, then responses).
tokenizer = ScratchTokenizer()
tokenizer.build_vocab(
    [*train_data["instruction"].tolist(), *train_data["response"].tolist()]
)
56
+
57
+ # Dataset Class
58
class TextDataset(Dataset):
    """Instruction/response pairs encoded to fixed-length id tensors.

    Args:
        data: Table exposing ``.iloc[idx]`` rows with "instruction" and
            "response" entries (a pandas DataFrame in this file).
        tokenizer: Object with ``encode(text, max_len)`` returning a list of ids.
        max_len: Length passed to ``tokenizer.encode`` for both sequences.
    """

    def __init__(self, data, tokenizer, max_len=200):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(src, tgt)`` long tensors for row *idx*."""
        row = self.data.iloc[idx]
        # Forward the configured max_len; previously it was stored but ignored,
        # so a non-default value silently had no effect.
        src_ids = self.tokenizer.encode(row["instruction"], self.max_len)
        tgt_ids = self.tokenizer.encode(row["response"], self.max_len)
        src = torch.tensor(src_ids, dtype=torch.long)
        tgt = torch.tensor(tgt_ids, dtype=torch.long)
        return src, tgt
73
+
74
# Wrap both splits as datasets, then batch them; only the training loader
# shuffles its rows.
train_dataset = TextDataset(train_data, tokenizer)
test_dataset = TextDataset(test_data, tokenizer)
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=8)
test_loader = DataLoader(dataset=test_dataset, batch_size=8)
79
+
80
+ # Improved GPT-Style Transformer Model
81
+
82
class GPTModel(nn.Module):
    """Transformer-decoder chatbot model.

    The embedded query (``src``) is fed to ``nn.TransformerDecoder`` as its
    memory, and the embedded partial response (``tgt``) is decoded under a
    causal mask. Both share one token embedding and one learned positional
    table of ``max_len`` positions.
    """

    def __init__(self, vocab_size, embed_size=256, num_heads=8, num_layers=6, max_len=200):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.pos_embedding = nn.Parameter(torch.randn(1, max_len, embed_size))
        # Decoder stack only; there is no separate encoder module.
        decoder_layer = nn.TransformerDecoderLayer(d_model=embed_size, nhead=num_heads)
        self.transformer = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)
        self.fc_out = nn.Linear(embed_size, vocab_size)

    def forward(self, src, tgt):
        """Return vocabulary logits for every position of *tgt*.

        *src* and *tgt* are batch-first id tensors; their sequence length
        (dimension 1) selects the leading rows of the positional table.
        """
        src_len, tgt_len = src.size(1), tgt.size(1)
        memory = self.embedding(src) + self.pos_embedding[:, :src_len, :]
        queries = self.embedding(tgt) + self.pos_embedding[:, :tgt_len, :]

        # Causal mask so each target position only attends to earlier ones.
        causal_mask = nn.Transformer.generate_square_subsequent_mask(tgt_len).to(tgt.device)

        # The decoder stack is sequence-first, so swap batch/sequence around it.
        decoded = self.transformer(
            queries.transpose(0, 1), memory.transpose(0, 1), tgt_mask=causal_mask
        )
        return self.fc_out(decoded.transpose(0, 1))
101
+
102
# Run on the GPU when one is visible, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Chatbot model sized to the tokenizer's vocabulary, with its optimizer and
# label-smoothed loss.
model = GPTModel(tokenizer.vocab_size)
model = model.to(device)
optimizer = optim.AdamW(params=model.parameters(), lr=2e-4)
criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
107
+
108
+
109
def load_model(model, path="gpt_model.pth"):
    """Load weights from *path* into *model* and switch it to eval mode.

    When *path* does not exist the model is left untouched and a message is
    printed; nothing is raised.
    """
    if not os.path.exists(path):
        print("Model file not found!")
        return

    state = torch.load(path, map_location=device)
    model.load_state_dict(state)
    model.eval()
    print("Model loaded successfully.")
116
+
117
+ load_model(model)
118
+
119
+ # Generate Response
120
def generate_response(model, query, max_length=200):
    """Greedily decode a reply to *query* with *model*.

    Decoding starts from <SOS> (id 1) and stops after *max_length* steps or
    as soon as <EOS> (id 2) is produced. The decoded string still contains
    the <SOS>/<EOS> markers.
    """
    sos_id, eos_id = 1, 2
    model.eval()

    with torch.no_grad():  # inference only, no gradient tracking
        encoded_query = tokenizer.encode(query)
        src = torch.tensor(encoded_query).unsqueeze(0).to(device)
        tgt = torch.tensor([[sos_id]]).to(device)

        step = 0
        while step < max_length:
            logits = model(src, tgt)
            # Most likely token at the last position, kept 2-D for torch.cat.
            next_token = logits[:, -1, :].argmax(dim=-1, keepdim=True)
            tgt = torch.cat([tgt, next_token], dim=1)
            if next_token.item() == eos_id:
                break
            step += 1

    generated_ids = tgt.squeeze(0).tolist()
    return tokenizer.decode(generated_ids)
134
+
135
+
136
# ==== Translation model settings ====
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

MAX_LEN = 350        # default cap on decoding steps in translate()
BATCH_SIZE = 8       # batch size of the translation DataLoader
MIN_FREQ = 2         # minimum token count to enter a vocabulary
NUM_EPOCHS = 18      # only referenced by the commented-out training loop

# Seq2SeqTransformer dimensions
EMB_SIZE = 128
NHEAD = 8
FFN_HID_DIM = 256
NUM_ENCODER_LAYERS = NUM_DECODER_LAYERS = 4
146
+
147
+ # ==== Tokenizers ====
148
spacy_eng = spacy.load("en_core_web_sm")


def tokenize_en(text):
    """Return the lower-cased token texts spaCy's English tokenizer yields for *text*."""
    lowered = []
    for token in spacy_eng.tokenizer(text):
        lowered.append(token.text.lower())
    return lowered
151
+
152
def tokenize_te(text):
    """Split *text* on single spaces after trimming surrounding whitespace.

    Consecutive inner spaces therefore produce empty-string tokens, and an
    empty input yields ``[""]``.
    """
    trimmed = text.strip()
    return trimmed.split(" ")
154
+
155
+ # ==== Vocab Builder ====
156
def build_vocab(sentences, tokenizer, min_freq):
    """Build a token -> id mapping from *sentences*.

    Args:
        sentences: Iterable of raw strings.
        tokenizer: Callable turning one string into an iterable of tokens.
        min_freq: Minimum corpus count a token needs to get its own id.

    Returns:
        dict whose first four entries are the reserved ``<pad>``, ``<sos>``,
        ``<eos>`` and ``<unk>`` (ids 0-3), followed by the frequent tokens in
        first-seen order with consecutive ids.
    """
    counter = Counter()
    for sent in sentences:
        counter.update(tokenizer(sent))

    vocab = {'<pad>': 0, '<sos>': 1, '<eos>': 2, '<unk>': 3}
    for word, freq in counter.items():
        # A corpus token that spells a special token must keep the reserved
        # id; reassigning it would also hand the same id to the next word.
        if freq >= min_freq and word not in vocab:
            vocab[word] = len(vocab)
    return vocab
165
+
166
+ # ==== Dataset ====
167
class TranslationDataset(Dataset):
    """Pairs of source/target id tensors built from a table of sentence pairs.

    Rows are read through ``df.iloc[idx]`` using the 'response' (source) and
    'translated_response' (target) entries.
    """

    def __init__(self, df, en_vocab, te_vocab):
        self.data, self.en_vocab, self.te_vocab = df, en_vocab, te_vocab

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(source_ids, target_ids)`` tensors, each framed by <sos>/<eos>."""
        row = self.data.iloc[idx]
        source_tokens = ['<sos>', *tokenize_en(row['response']), '<eos>']
        target_tokens = ['<sos>', *tokenize_te(row['translated_response']), '<eos>']

        # Out-of-vocabulary tokens fall back to each side's <unk> id.
        en_unk = self.en_vocab['<unk>']
        te_unk = self.te_vocab['<unk>']
        source_ids = [self.en_vocab.get(tok, en_unk) for tok in source_tokens]
        target_ids = [self.te_vocab.get(tok, te_unk) for tok in target_tokens]

        return torch.tensor(source_ids), torch.tensor(target_ids)
187
+
188
+ # ==== Collate Function ====
189
def collate_fn(batch):
    """Pad a list of ``(src, tgt)`` tensor pairs into two batch-first tensors.

    Source sequences are padded with the English <pad> id and target
    sequences with the Telugu <pad> id, both taken from the module-level
    vocabularies.
    """
    sources, targets = zip(*batch)
    src_pad = en_vocab['<pad>']
    padded_src = pad_sequence(sources, batch_first=True, padding_value=src_pad)
    tgt_pad = te_vocab['<pad>']
    padded_tgt = pad_sequence(targets, batch_first=True, padding_value=tgt_pad)
    return padded_src, padded_tgt
194
+
195
+ # ==== Transformer Model ====
196
class Seq2SeqTransformer(nn.Module):
    """Encoder-decoder ``nn.Transformer`` with separate source/target embeddings.

    Args:
        num_encoder_layers: Number of encoder layers.
        num_decoder_layers: Number of decoder layers.
        emb_size: Embedding width, used as the transformer's ``d_model``.
        src_vocab_size: Size of the source vocabulary.
        tgt_vocab_size: Size of the target vocabulary (width of the logits).
        nhead: Number of attention heads.
        dim_feedforward: Hidden width of the feed-forward sub-layers.
        dropout: Dropout probability for the embeddings and the transformer.

    Only token embeddings (plus dropout) are fed to the transformer; no
    positional encoding is added in this class.
    """

    def __init__(self, num_encoder_layers, num_decoder_layers,
                 emb_size, src_vocab_size, tgt_vocab_size,
                 nhead, dim_feedforward=512, dropout=0.1):
        super().__init__()
        self.transformer = nn.Transformer(d_model=emb_size, nhead=nhead,
                                          num_encoder_layers=num_encoder_layers,
                                          num_decoder_layers=num_decoder_layers,
                                          dim_feedforward=dim_feedforward, dropout=dropout)
        self.src_tok_emb = nn.Embedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = nn.Embedding(tgt_vocab_size, emb_size)
        self.fc_out = nn.Linear(emb_size, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, tgt):
        """Return target-vocabulary logits for every position of *tgt*.

        *src* and *tgt* are batch-first id tensors (sequence length is read
        from dimension 1).
        """
        # Masks follow the inputs' own device instead of a module-level
        # global, so the model works wherever its inputs live.
        # NOTE(review): the source side also gets a square subsequent mask;
        # the saved weights were presumably trained this way, so it is kept.
        src_mask = self.transformer.generate_square_subsequent_mask(src.size(1)).to(src.device)
        tgt_mask = self.transformer.generate_square_subsequent_mask(tgt.size(1)).to(tgt.device)

        src_emb = self.dropout(self.src_tok_emb(src))
        tgt_emb = self.dropout(self.tgt_tok_emb(tgt))
        # nn.Transformer is sequence-first here, so swap batch/sequence around it.
        outs = self.transformer(src_emb.permute(1, 0, 2), tgt_emb.permute(1, 0, 2),
                                src_mask=src_mask, tgt_mask=tgt_mask)
        return self.fc_out(outs.permute(1, 0, 2))
219
+
220
def translate(model, sentence, en_vocab, te_vocab, te_inv_vocab, max_len=MAX_LEN):
    """Greedily translate *sentence* with *model*.

    Args:
        model: Callable ``model(src_ids, tgt_ids)`` returning logits indexed
            as (batch, position, vocabulary).
        sentence: Source text, tokenized with ``tokenize_en``.
        en_vocab: Source token -> id mapping (must contain '<unk>').
        te_vocab: Target token -> id mapping (must contain '<sos>', '<eos>').
        te_inv_vocab: Target id -> token mapping.
        max_len: Maximum number of tokens to generate.

    Returns:
        The generated target tokens joined by spaces, without a trailing
        '<eos>'; an empty string if nothing was generated.
    """
    model.eval()
    tokens = ['<sos>'] + tokenize_en(sentence) + ['<eos>']
    unk_id = en_vocab['<unk>']
    src_ids = torch.tensor([[en_vocab.get(t, unk_id) for t in tokens]]).to(DEVICE)
    tgt_ids = torch.tensor([[te_vocab['<sos>']]]).to(DEVICE)
    eos_id = te_vocab['<eos>']

    # Inference only: skip autograd bookkeeping for every decoding step.
    with torch.no_grad():
        for _ in range(max_len):
            out = model(src_ids, tgt_ids)
            next_token = out.argmax(-1)[:, -1].item()
            tgt_ids = torch.cat([tgt_ids, torch.tensor([[next_token]]).to(DEVICE)], dim=1)
            if next_token == eos_id:
                break

    # Drop the leading <sos>; guard the empty case (e.g. max_len == 0).
    translated = [te_inv_vocab[idx] for idx in tgt_ids[0, 1:].tolist()]
    if translated and translated[-1] == '<eos>':
        translated.pop()
    return ' '.join(translated)
235
+
236
# ==== Load Data ====
# Parallel corpus; the columns read below are 'response' (English source)
# and 'translated_response' (target, tokenized by tokenize_te).
df_telugu = pd.read_csv("merged_translated_responses.csv")
# Drop rows missing either side, then make sure every entry is a string.
df_telugu = df_telugu.dropna(subset=['response', 'translated_response'])
df_telugu['response'] = df_telugu['response'].astype(str)
df_telugu['translated_response'] = df_telugu['translated_response'].astype(str)

# Build vocabularies. They are rebuilt from the CSV on every start, so the
# CSV and tokenizers must match what the saved weights were trained with.
en_vocab = build_vocab(df_telugu['response'], tokenize_en, MIN_FREQ)
te_vocab = build_vocab(df_telugu['translated_response'], tokenize_te, MIN_FREQ)
te_inv_vocab = {idx: tok for tok, idx in te_vocab.items()}

# Prepare Dataset & DataLoader
dataset = TranslationDataset(df_telugu, en_vocab, te_vocab)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

# ==== Translation model ====
model_telugu = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                  len(en_vocab), len(te_vocab), NHEAD, FFN_HID_DIM).to(DEVICE)

# Load saved weights (deserialized on CPU, then copied into the model's
# parameters on DEVICE by load_state_dict).
model_telugu.load_state_dict(
    torch.load("english_telugu_transformer.pth", map_location=torch.device('cpu'))
)
model_telugu.eval()

# Loss/optimizer for the translation model. No training runs in this file;
# the optimizer is bound to model_telugu's parameters (it was previously
# bound to the chatbot model by mistake).
pad_idx = te_vocab['<pad>']
criterion_telugu = nn.CrossEntropyLoss(ignore_index=pad_idx)
optimizer_telugu = optim.Adam(model_telugu.parameters(), lr=0.0005)

# ==== Web application ====
app = Flask(__name__)
CORS(app)
276
+
277
@app.route("/intent")
def home():
    """Return the distinct non-null values of the dataset's 'intent' column as JSON."""
    unique_intents = set(df['intent'].dropna())
    payload = {"intents": list(unique_intents)}
    return jsonify(payload)
280
+
281
@app.route("/query", methods=["POST"])
def query_model():
    """Answer a chatbot query in English and Telugu.

    Expects a JSON object with a non-empty string "query". Responds with
    ``{"telugu": ..., "english": ...}``, or a 400 JSON error when the body is
    missing, is not a JSON object, or carries no usable query.
    """
    global audio_telugu_response

    # silent=True yields None instead of raising on a missing/invalid body;
    # a body such as `null` or a list is also rejected here rather than
    # crashing on `.get`.
    data = request.get_json(silent=True)
    query = data.get("query", "") if isinstance(data, dict) else ""

    if not isinstance(query, str) or not query.strip():
        return jsonify({"error": "Query cannot be empty"}), 400

    response = generate_response(model, query)
    # The decoder output still carries the sequence markers; strip them.
    response = response.replace("<EOS>", "").replace("<SOS>", "").strip()

    telugu_response = translate(model_telugu, response, en_vocab, te_vocab, te_inv_vocab)
    # Kept for backward compatibility: the last translation stays available
    # as a module-level value.
    audio_telugu_response = telugu_response
    return jsonify({"telugu": telugu_response, "english": response})
298
@app.route("/audio", methods=["POST"])
def get_audio():
    """Convert the posted text to Telugu speech and return it as an MP3.

    Expects a JSON object with a non-empty string "text". Responds with the
    audio (``audio/mpeg``), or a 400 JSON error when no usable text is given.
    The temporary MP3 is removed once the request has been handled.
    """
    # silent=True yields None instead of raising on a missing/invalid body.
    data = request.get_json(silent=True)
    text = data.get("text") if isinstance(data, dict) else None

    if not isinstance(text, str) or not text.strip():
        return jsonify({"error": "No Response To convert to speech"}), 400

    os.makedirs("audio_temp", exist_ok=True)
    filename = f"speech_{uuid.uuid4().hex}.mp3"
    # Absolute path so that writing the file and sending it refer to the same
    # location however relative paths are resolved.
    filepath = os.path.abspath(os.path.join("audio_temp", filename))

    # Convert text to Telugu speech
    try:
        gTTS(text=text, lang="te").save(filepath)
    except Exception:
        # Do not leave a partially written file behind; let the error propagate.
        if os.path.exists(filepath):
            os.remove(filepath)
        raise

    # Automatically delete the file after sending
    @after_this_request
    def cleanup(response):
        try:
            os.remove(filepath)
        except OSError as e:
            print(f"Cleanup error: {e}")
        return response

    return send_file(filepath, mimetype="audio/mpeg", as_attachment=False)