vinay0123 committed on
Commit
d80173a
·
verified ·
1 Parent(s): 2a188be

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +169 -107
app.py CHANGED
@@ -1,123 +1,185 @@
 
1
  import torch
2
  import torch.nn as nn
 
 
 
3
  import pandas as pd
4
- from torch.utils.data import Dataset
 
5
  from sklearn.model_selection import train_test_split
6
- from fastapi import FastAPI
7
- from pydantic import BaseModel
8
- from fastapi.responses import JSONResponse
9
- import os
10
 
11
- # Load data
12
- url = "https://drive.google.com/uc?id=1RCZShB5ohy1HdU-mogcP16TbeVv9txpY"
13
- df = pd.read_csv(url)
14
-
15
- # Tokenizer
16
- class ScratchTokenizer:
17
- def __init__(self):
18
- self.word2idx = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
19
- self.idx2word = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
20
- self.vocab_size = 4
21
-
22
- def build_vocab(self, texts):
23
- for text in texts:
24
- for word in text.split():
25
- if word not in self.word2idx:
26
- self.word2idx[word] = self.vocab_size
27
- self.idx2word[self.vocab_size] = word
28
- self.vocab_size += 1
29
-
30
- def encode(self, text, max_len=200):
31
- tokens = [self.word2idx.get(word, 3) for word in text.split()]
32
- tokens = [1] + tokens[:max_len - 2] + [2]
33
- return tokens + [0] * (max_len - len(tokens))
34
-
35
- def decode(self, tokens):
36
- return " ".join([self.idx2word.get(idx, "<UNK>") for idx in tokens if idx > 0])
37
-
38
- # Train-Test Split
39
- train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)
40
-
41
- # Initialize Tokenizer
42
- tokenizer = ScratchTokenizer()
43
- tokenizer.build_vocab(train_data["instruction"].tolist() + train_data["response"].tolist())
44
-
45
- # Dataset Class (not used in inference but useful for training)
46
- class TextDataset(Dataset):
47
- def __init__(self, data, tokenizer, max_len=200):
48
- self.data = data
49
- self.tokenizer = tokenizer
50
- self.max_len = max_len
 
51
 
52
  def __len__(self):
53
  return len(self.data)
54
 
55
  def __getitem__(self, idx):
56
- src_text = self.data.iloc[idx]["instruction"]
57
- tgt_text = self.data.iloc[idx]["response"]
58
- src = torch.tensor(self.tokenizer.encode(src_text), dtype=torch.long)
59
- tgt = torch.tensor(self.tokenizer.encode(tgt_text), dtype=torch.long)
60
- return src, tgt
61
-
62
- # Model
63
- class GPTModel(nn.Module):
64
- def __init__(self, vocab_size, embed_size=256, num_heads=8, num_layers=6, max_len=200):
65
- super(GPTModel, self).__init__()
66
- self.embedding = nn.Embedding(vocab_size, embed_size)
67
- self.pos_embedding = nn.Parameter(torch.randn(1, max_len, embed_size))
68
- self.transformer = nn.TransformerDecoder(
69
- nn.TransformerDecoderLayer(d_model=embed_size, nhead=num_heads),
70
- num_layers=num_layers
71
- )
72
- self.fc_out = nn.Linear(embed_size, vocab_size)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
  def forward(self, src, tgt):
75
- src_emb = self.embedding(src) + self.pos_embedding[:, :src.size(1), :]
76
- tgt_emb = self.embedding(tgt) + self.pos_embedding[:, :tgt.size(1), :]
77
- tgt_mask = nn.Transformer.generate_square_subsequent_mask(tgt.size(1)).to(tgt.device)
78
- output = self.transformer(tgt_emb.permute(1, 0, 2), src_emb.permute(1, 0, 2), tgt_mask=tgt_mask)
79
- return self.fc_out(output.permute(1, 0, 2))
80
-
81
- # Load model
82
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
83
- model = GPTModel(tokenizer.vocab_size).to(device)
84
-
85
- def load_model(model, path="gpt_model.pth"):
86
- if os.path.exists(path):
87
- model.load_state_dict(torch.load(path, map_location=device))
88
- model.eval()
89
- print("Model loaded successfully.")
90
- else:
91
- print("Model file not found!")
92
-
93
- load_model(model)
94
-
95
- # Generate Response
96
- def generate_response(model, query, max_length=200):
97
- model.eval()
98
- src = torch.tensor(tokenizer.encode(query)).unsqueeze(0).to(device)
99
- tgt = torch.tensor([[1]]).to(device) # <SOS>
100
- for _ in range(max_length):
101
- output = model(src, tgt)
102
- next_word = output.argmax(-1)[:, -1].unsqueeze(1)
103
- tgt = torch.cat([tgt, next_word], dim=1)
104
- if next_word.item() == 2: # <EOS>
105
- break
106
- return tokenizer.decode(tgt.squeeze(0).tolist())
107
 
108
- # FastAPI app
109
- app = FastAPI()
 
 
 
110
 
111
- class Query(BaseModel):
112
- query: str
 
 
 
 
 
 
 
 
 
 
113
 
114
- @app.get("/")
115
- async def root():
116
- return {"message": "Transformer-based Response Generator API is running!"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
- @app.post("/query")
119
- async def query_model(query: Query):
120
- if not query.query.strip():
121
- return JSONResponse(status_code=400, content={"error": "Query cannot be empty"})
122
- response = generate_response(model, query.query)
123
- return {"query": query.query, "response": response}
 
1
+ import textwrap
2
  import torch
3
  import torch.nn as nn
4
+ import torch.optim as optim
5
+ import spacy
6
+ import random
7
  import pandas as pd
8
+ from torch.utils.data import Dataset, DataLoader
9
+ from torch.nn.utils.rnn import pad_sequence
10
  from sklearn.model_selection import train_test_split
11
+ from flask import Flask ,request, jsonify,send_file,after_this_request
12
+ from collections import Counter
13
+ from flask_cors import CORS
14
+ import requests
15
 
16
+ import uuid
17
+ import os
18
+ import time
19
+
20
# ==== Device & hyperparameters ====
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # prefer GPU when available
MAX_LEN = 350           # default max decoding steps in translate()
BATCH_SIZE = 8          # DataLoader batch size (training only)
EMB_SIZE = 128          # transformer d_model / embedding width
NHEAD = 8               # attention heads (must divide EMB_SIZE)
FFN_HID_DIM = 256       # transformer feed-forward hidden size
NUM_ENCODER_LAYERS = 4
NUM_DECODER_LAYERS = 4
NUM_EPOCHS = 18         # used only by the commented-out training loop below
MIN_FREQ = 2            # min token frequency to enter a vocabulary
PORT = 7680             # NOTE(review): defined but not referenced in this file — confirm the server binds it
31
+
32
# ==== Tokenizers ====
# English tokenization is delegated to spaCy's rule-based tokenizer.
spacy_eng = spacy.load("en_core_web_sm")

def tokenize_en(text):
    """Tokenize English *text* into a list of lowercase spaCy tokens."""
    return [token.text.lower() for token in spacy_eng.tokenizer(text)]
36
+
37
def tokenize_te(text):
    """Tokenize Telugu *text* by trimming and splitting on single spaces.

    Note: splits on ' ' exactly (not arbitrary whitespace), so runs of
    spaces produce empty-string tokens — same contract as before.
    """
    trimmed = text.strip()
    return trimmed.split(" ")
39
+
40
# ==== Vocab Builder ====
def build_vocab(sentences, tokenizer, min_freq):
    """Build a token -> index mapping from an iterable of sentences.

    Indices 0-3 are reserved for the special tokens; remaining tokens are
    numbered in first-seen order, keeping only those whose corpus frequency
    is at least *min_freq*.
    """
    counts = Counter()
    for sentence in sentences:
        counts.update(tokenizer(sentence))

    vocab = {'<pad>': 0, '<sos>': 1, '<eos>': 2, '<unk>': 3}
    for token, frequency in counts.items():
        if frequency < min_freq:
            continue
        vocab[token] = len(vocab)
    return vocab
50
+
51
# ==== Dataset ====
class TranslationDataset(Dataset):
    """(English ids, Telugu ids) pairs built lazily from a DataFrame.

    Expects columns 'response' (English source) and 'translated_response'
    (Telugu target); out-of-vocab tokens map to '<unk>'.
    """

    def __init__(self, df, en_vocab, te_vocab):
        self.data = df
        self.en_vocab = en_vocab
        self.te_vocab = te_vocab

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        src_tokens = ['<sos>'] + tokenize_en(row['response']) + ['<eos>']
        tgt_tokens = ['<sos>'] + tokenize_te(row['translated_response']) + ['<eos>']

        src_unk = self.en_vocab['<unk>']
        tgt_unk = self.te_vocab['<unk>']
        src_ids = [self.en_vocab.get(tok, src_unk) for tok in src_tokens]
        tgt_ids = [self.te_vocab.get(tok, tgt_unk) for tok in tgt_tokens]

        return torch.tensor(src_ids), torch.tensor(tgt_ids)
72
+
73
# ==== Collate Function ====
def collate_fn(batch):
    """Pad a list of (src, tgt) tensor pairs into two batch-first tensors.

    NOTE(review): relies on the module-level en_vocab / te_vocab for the
    padding index — confirm both map '<pad>' to 0 as built above.
    """
    sources, targets = zip(*batch)
    padded_sources = pad_sequence(sources, padding_value=en_vocab['<pad>'], batch_first=True)
    padded_targets = pad_sequence(targets, padding_value=te_vocab['<pad>'], batch_first=True)
    return padded_sources, padded_targets
79
+
80
# ==== Transformer Model ====
class Seq2SeqTransformer(nn.Module):
    """Encoder-decoder Transformer for English -> Telugu translation.

    Inputs are batch-first ``(batch, seq)`` id tensors; they are permuted to
    the ``(seq, batch, emb)`` layout that ``nn.Transformer`` expects by
    default, and the logits are permuted back before returning.
    """

    def __init__(self, num_encoder_layers, num_decoder_layers,
                 emb_size, src_vocab_size, tgt_vocab_size,
                 nhead, dim_feedforward=512, dropout=0.1):
        super().__init__()
        self.transformer = nn.Transformer(d_model=emb_size, nhead=nhead,
                                          num_encoder_layers=num_encoder_layers,
                                          num_decoder_layers=num_decoder_layers,
                                          dim_feedforward=dim_feedforward, dropout=dropout)
        self.src_tok_emb = nn.Embedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = nn.Embedding(tgt_vocab_size, emb_size)
        self.fc_out = nn.Linear(emb_size, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, tgt):
        """Return logits of shape (batch, tgt_len, tgt_vocab_size).

        src/tgt: batch-first LongTensors of token ids.
        """
        # Fix: build the masks on the input tensors' own device instead of the
        # module-level DEVICE global, so the model works wherever it is moved
        # and does not depend on an external name.
        src_mask = self.transformer.generate_square_subsequent_mask(src.size(1)).to(src.device)
        tgt_mask = self.transformer.generate_square_subsequent_mask(tgt.size(1)).to(tgt.device)
        # NOTE(review): applying a *causal* mask to the encoder (src_mask) is
        # unusual for translation models; kept as-is because the shipped
        # checkpoint was trained with it — changing it would alter outputs.

        src_emb = self.dropout(self.src_tok_emb(src))
        tgt_emb = self.dropout(self.tgt_tok_emb(tgt))
        outs = self.transformer(src_emb.permute(1, 0, 2), tgt_emb.permute(1, 0, 2),
                                src_mask=src_mask, tgt_mask=tgt_mask)
        return self.fc_out(outs.permute(1, 0, 2))
104
 
105
def translate(model, sentence, en_vocab, te_vocab, te_inv_vocab, max_len=MAX_LEN):
    """Greedy-decode the Telugu translation of an English *sentence*.

    Re-runs the full forward pass each step and appends the arg-max token
    until '<eos>' is produced or *max_len* steps elapse.
    """
    model.eval()
    source_tokens = ['<sos>'] + tokenize_en(sentence) + ['<eos>']
    unk = en_vocab['<unk>']
    src_ids = torch.tensor([[en_vocab.get(tok, unk) for tok in source_tokens]]).to(DEVICE)
    tgt_ids = torch.tensor([[te_vocab['<sos>']]]).to(DEVICE)

    for _ in range(max_len):
        logits = model(src_ids, tgt_ids)
        predicted = logits.argmax(-1)[:, -1].item()
        tgt_ids = torch.cat([tgt_ids, torch.tensor([[predicted]]).to(DEVICE)], dim=1)
        if predicted == te_vocab['<eos>']:
            break

    # Map generated ids back to tokens, dropping the leading <sos> and a
    # trailing <eos> if decoding terminated on one.
    words = [te_inv_vocab[tok.item()] for tok in tgt_ids[0][1:]]
    if words[-1] == '<eos>':
        words = words[:-1]
    return ' '.join(words)
120
+
121
# ==== Load Data ====
# Parallel corpus; columns: 'response' (English), 'translated_response' (Telugu)
df_telugu = pd.read_csv("merged_translated_responses.csv")
# Clean NaN or non-string entries
df_telugu = df_telugu.dropna(subset=['response', 'translated_response'])

# Ensure all entries are strings
df_telugu['response'] = df_telugu['response'].astype(str)
df_telugu['translated_response'] = df_telugu['translated_response'].astype(str)

# Build vocabularies (must match the training run exactly, otherwise the
# embedding sizes below will not line up with the saved checkpoint)
en_vocab = build_vocab(df_telugu['response'], tokenize_en, MIN_FREQ)
te_vocab = build_vocab(df_telugu['translated_response'], tokenize_te, MIN_FREQ)
te_inv_vocab = {idx: tok for tok, idx in te_vocab.items()}  # id -> token, for decoding

# Prepare Dataset & DataLoader (only needed if the training loop below is enabled)
dataset = TranslationDataset(df_telugu, en_vocab, te_vocab)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

# Initialize Model
# NOTE(review): this `model` (and criterion/optimizer) is never trained or used
# for inference below — `model_telugu` is; looks like training leftovers.
model = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                           len(en_vocab), len(te_vocab), NHEAD, FFN_HID_DIM).to(DEVICE)

pad_idx = te_vocab['<pad>']
criterion_telugu = nn.CrossEntropyLoss(ignore_index=pad_idx)  # padding positions excluded from loss
optimizer_telugu = optim.Adam(model.parameters(), lr=0.0005)

# ==== Training ====
# (disabled in the deployed app; `train` is not defined in this file)
# for epoch in range(NUM_EPOCHS):
#     loss = train(model, dataloader, optimizer, criterion)
#     print(f"Epoch {epoch+1}, Loss: {loss:.4f}")

# ==== Try Translation ====

# Inference model: same architecture, weights restored from the checkpoint.
model_telugu = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,len(en_vocab), len(te_vocab), NHEAD, FFN_HID_DIM).to(DEVICE)

# Load saved weights (forced onto CPU regardless of DEVICE)
model_telugu.load_state_dict(torch.load("english_telugu_transformer.pth",map_location = torch.device('cpu')))
model_telugu.eval()
159
app = Flask(__name__)
CORS(app)  # allow cross-origin requests from the web frontend


@app.route("/")
def home():
    """Health-check endpoint confirming the service is up."""
    payload = {"message": "hellooooooooo"}
    return jsonify(payload)
165
+
166
+
167
@app.route("/translate", methods=["POST"])
def translate_text():
    """POST /translate — translate JSON ``{"text": ...}`` English -> Telugu.

    Returns ``{"english", "telugu", "time"}`` on success, or a 400 JSON
    error for a missing body, missing field, or blank text.
    """
    # Fix: request.get_json() returns None (or raises) on a missing/non-JSON
    # body, which previously crashed on .get(); silent=True plus a default
    # routes that case onto the 400 path instead.
    data = request.get_json(silent=True) or {}
    text = data.get("text", "")
    # Fix: whitespace-only input previously slipped past the empty check.
    if not isinstance(text, str) or not text.strip():
        return jsonify({"error": "Text cannot be empty"}), 400

    # First generate English response (currently just echoes the input text)
    english_response = text
    start = time.time()
    # Then translate to Telugu
    telugu_response = translate(model_telugu, english_response, en_vocab, te_vocab, te_inv_vocab)
    end = time.time()
    return jsonify({
        "english": english_response,
        "telugu": telugu_response,
        "time": end - start
    })
185