DP27 committed on
Commit
c66a046
·
verified ·
1 Parent(s): 072e66a

Upload 5 files

Browse files
Files changed (5) hide show
  1. config.json +6 -0
  2. ma_vocab.json +157 -0
  3. main_model.py +230 -0
  4. seq2seq_model.pth +3 -0
  5. temp.py +29 -0
config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "vocab_size": 155,
3
+ "embedding_dim": 64,
4
+ "hidden_dim": 128,
5
+ "max_len": 32
6
+ }
ma_vocab.json ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do": 4,
3
+ "medicaid": 5,
4
+ "advantage": 6,
5
+ "plans": 7,
6
+ "cover": 8,
7
+ "prescription": 9,
8
+ "drugs?": 10,
9
+ "who": 11,
10
+ "is": 12,
11
+ "eligible": 13,
12
+ "for": 14,
13
+ "a": 15,
14
+ "plan?": 16,
15
+ "can": 17,
16
+ "i": 18,
17
+ "keep": 19,
18
+ "my": 20,
19
+ "doctor": 21,
20
+ "with": 22,
21
+ "what": 23,
22
+ "the": 24,
23
+ "difference": 25,
24
+ "between": 26,
25
+ "and": 27,
26
+ "medicare?": 28,
27
+ "benefits": 29,
28
+ "provide?": 30,
29
+ "how": 31,
30
+ "enroll": 32,
31
+ "in": 33,
32
+ "are": 34,
33
+ "there": 35,
34
+ "any": 36,
35
+ "costs": 37,
36
+ "associated": 38,
37
+ "plans?": 39,
38
+ "switch": 40,
39
+ "from": 41,
40
+ "original": 42,
41
+ "medicare": 43,
42
+ "to": 44,
43
+ "does": 45,
44
+ "dental": 46,
45
+ "vision?": 47,
46
+ "yes,": 48,
47
+ "most": 49,
48
+ "include": 50,
49
+ "part": 51,
50
+ "d": 52,
51
+ "coverage,": 53,
52
+ "which": 54,
53
+ "helps": 55,
54
+ "pay": 56,
55
+ "drugs.": 57,
56
+ "individuals": 58,
57
+ "both": 59,
58
+ "b": 60,
59
+ "qualify": 61,
60
+ "their": 62,
61
+ "state": 63,
62
+ "plan.": 64,
63
+ "it": 65,
64
+ "depends": 66,
65
+ "on": 67,
66
+ "plan's": 68,
67
+ "provider": 69,
68
+ "network.": 70,
69
+ "some": 71,
70
+ "have": 72,
71
+ "preferred": 73,
72
+ "network": 74,
73
+ "of": 75,
74
+ "doctors,": 76,
75
+ "while": 77,
76
+ "others": 78,
77
+ "allow": 79,
78
+ "you": 80,
79
+ "see": 81,
80
+ "accepts": 82,
81
+ "medicaid.": 83,
82
+ "federal": 84,
83
+ "program": 85,
84
+ "people": 86,
85
+ "aged": 87,
86
+ "65+": 88,
87
+ "or": 89,
88
+ "certain": 90,
89
+ "disabilities,": 91,
90
+ "state-run": 92,
91
+ "that": 93,
92
+ "low-income": 94,
93
+ "healthcare": 95,
94
+ "costs.": 96,
95
+ "typically": 97,
96
+ "hospital": 98,
97
+ "medical": 99,
98
+ "drugs,": 100,
99
+ "dental,": 101,
100
+ "vision,": 102,
101
+ "hearing,": 103,
102
+ "transportation,": 104,
103
+ "sometimes": 105,
104
+ "additional": 106,
105
+ "like": 107,
106
+ "fitness": 108,
107
+ "programs.": 109,
108
+ "during": 110,
109
+ "medicare\u2019s": 111,
110
+ "annual": 112,
111
+ "enrollment": 113,
112
+ "period": 114,
113
+ "(aep)": 115,
114
+ "special": 116,
115
+ "(sep)": 117,
116
+ "if": 118,
117
+ "qualify.": 119,
118
+ "apply": 120,
119
+ "online,": 121,
120
+ "by": 122,
121
+ "phone,": 123,
122
+ "through": 124,
123
+ "an": 125,
124
+ "insurance": 126,
125
+ "provider.": 127,
126
+ "low": 128,
127
+ "no": 129,
128
+ "premiums.": 130,
129
+ "however,": 131,
130
+ "may": 132,
131
+ "vary": 133,
132
+ "depending": 134,
133
+ "plan,": 135,
134
+ "including": 136,
135
+ "copayments,": 137,
136
+ "deductibles,": 138,
137
+ "out-of-pocket": 139,
138
+ "expenses.": 140,
139
+ "also": 141,
140
+ "known": 142,
141
+ "as": 143,
142
+ "dual-eligible": 144,
143
+ "needs": 145,
144
+ "plan": 146,
145
+ "(d-snp),": 147,
146
+ "type": 148,
147
+ "designed": 149,
148
+ "medicare,": 150,
149
+ "period.": 151,
150
+ "many": 152,
151
+ "hearing": 153,
152
+ "coverage.": 154,
153
+ "<PAD>": 0,
154
+ "<UNK>": 1,
155
+ "<SOS>": 2,
156
+ "<EOS>": 3
157
+ }
main_model.py ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from collections import Counter
3
+ import torch
4
+ import torch.nn as nn
5
+ import json
6
+ from sklearn.model_selection import train_test_split
7
+
8
+
9
def build_vocab(texts):
    """Build a word-to-index vocabulary and save it as JSON.

    Each text is lower-cased and split on whitespace. Distinct words are
    numbered from 4 upward in order of first appearance; indices 0-3 are
    reserved for the special tokens ``<PAD>``, ``<UNK>``, ``<SOS>`` and
    ``<EOS>``.

    Args:
        texts: Iterable of strings to collect words from.

    Returns:
        dict mapping each word (and the four special tokens) to its index.

    Side effects:
        Writes the vocabulary to ``./model/ma_vocab.json``, creating the
        ``./model`` directory first if it does not exist.
    """
    import os

    # Counter is a dict subclass, so iteration follows first-seen order.
    word_counts = Counter()
    for text in texts:
        word_counts.update(text.lower().split())

    # +4 keeps 0 for padding, 1 for unknown, 2 for <SOS>, 3 for <EOS>.
    vocab = {word: idx + 4 for idx, word in enumerate(word_counts)}
    vocab["<PAD>"] = 0
    vocab["<UNK>"] = 1
    vocab["<SOS>"] = 2
    vocab["<EOS>"] = 3

    # Without this, open() fails with FileNotFoundError on a fresh checkout.
    os.makedirs("./model", exist_ok=True)
    with open("./model/ma_vocab.json", "w", encoding="utf-8") as f:
        json.dump(vocab, f, indent=4)
    return vocab
23
+
24
+
25
+ # Tokenize function
26
def tokenize(text, vocab):
    """Convert ``text`` into a list of vocabulary indices.

    The text is split on whitespace and each word is lower-cased before
    lookup. Words missing from ``vocab`` map to the ``<UNK>`` index. The
    result is wrapped in the ``<SOS>`` and ``<EOS>`` indices.

    Args:
        text: String to tokenize.
        vocab: Mapping from word to index that contains the special tokens.

    Returns:
        list of indices: ``[<SOS>, word ids..., <EOS>]``.
    """
    token_ids = [vocab["<SOS>"]]
    for word in text.split():
        token_ids.append(vocab.get(word.lower(), vocab["<UNK>"]))
    token_ids.append(vocab["<EOS>"])
    return token_ids
32
+
33
+
34
+ # Pad sequences
35
def pad_sequences(sequences, max_len):
    """Right-pad (or truncate) token sequences to a fixed length.

    Args:
        sequences: List of token-id sequences (lists of numbers).
        max_len: Number of columns in the result.

    Returns:
        numpy array of shape ``(len(sequences), max_len)`` with numpy's
        default float dtype. Short rows are filled with zeros on the right;
        rows longer than ``max_len`` keep only their first ``max_len`` items.
    """
    padded = np.zeros((len(sequences), max_len))
    for row, seq in enumerate(sequences):
        # Clip first: assigning an over-long sequence into the row slice
        # would otherwise raise a shape-mismatch ValueError.
        clipped = seq[:max_len]
        padded[row, : len(clipped)] = clipped
    return padded
40
+
41
+
42
def evaluate_model(model, test_questions, test_answers, vocab, max_len):
    """Compute exact-match accuracy of generated answers.

    For every question the model's ``generate`` method is called and the
    result is compared, case-insensitively, with the reference answer. The
    question, reference and generated answer are printed for each pair.

    Args:
        model: Object exposing ``generate(question, vocab, max_len)``.
        test_questions: List of question strings.
        test_answers: List of reference answers, expected to be the same
            length as ``test_questions``.
        vocab: Word-to-index mapping forwarded to ``generate``.
        max_len: Maximum sequence length forwarded to ``generate``.

    Returns:
        Fraction of questions answered exactly, or ``0.0`` when there are
        no questions.
    """
    # Guard against dividing by zero on an empty evaluation set.
    if not test_questions:
        return 0.0

    correct = 0
    for question, true_answer in zip(test_questions, test_answers):
        generated_answer = model.generate(question, vocab, max_len)
        print(f"Question: {question}")
        print(f"True Answer: {true_answer}")
        print(f"Generated Answer: {generated_answer}")
        if generated_answer.lower() == true_answer.lower():
            correct += 1
    return correct / len(test_questions)
55
+
56
+
57
+ # Define Attention Layer
58
class Attention(nn.Module):
    """Additive attention that scores encoder outputs against one state."""

    def __init__(self, hidden_dim):
        """Create the scoring layers.

        Args:
            hidden_dim: Feature size of both the query state and the
                encoder outputs.
        """
        super().__init__()
        # Projects the concatenated [state; encoder output] to hidden_dim.
        self.attn = nn.Linear(hidden_dim * 2, hidden_dim)
        # Learned vector that reduces each projected energy to a scalar.
        self.v = nn.Parameter(torch.rand(hidden_dim))

    def forward(self, hidden, encoder_outputs):
        """Return attention weights over the encoder time steps.

        Args:
            hidden: Query state of shape ``(batch, hidden_dim)``.
            encoder_outputs: Tensor of shape ``(batch, seq_len, hidden_dim)``.

        Returns:
            Tensor of shape ``(batch, seq_len)``; each row is a softmax
            distribution over the sequence positions.
        """
        steps = encoder_outputs.size(1)
        # Copy the query once per encoder position so it can be concatenated.
        tiled_hidden = hidden.unsqueeze(1).repeat(1, steps, 1)
        joined = torch.cat((tiled_hidden, encoder_outputs), dim=2)
        energy = torch.tanh(self.attn(joined))
        scores = torch.sum(self.v * energy, dim=2)
        return torch.softmax(scores, dim=1)
74
+
75
+
76
+ # Define the Seq2Seq Model with Attention
77
class Seq2Seq(nn.Module):
    """LSTM encoder-decoder with an attention context added to the output."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        """Build the layers.

        Args:
            vocab_size: Number of token ids; sizes the embedding table and
                the output projection.
            embedding_dim: Size of each token embedding.
            hidden_dim: Hidden size of both LSTMs.
        """
        super().__init__()
        # Attribute names and creation order are kept as-is so previously
        # saved state dicts continue to load.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.encoder = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.decoder = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.attn = Attention(hidden_dim)
        self.fc = nn.Linear(hidden_dim, vocab_size)
        self.dropout = nn.Dropout(0.5)

    def forward(self, src, trg):
        """Score next-token candidates for every target position.

        Args:
            src: Integer tensor of source token ids, ``(batch, src_len)``.
            trg: Integer tensor of decoder input ids, ``(batch, trg_len)``.

        Returns:
            Tensor of unnormalised scores, ``(batch, trg_len, vocab_size)``.
        """
        # Encoder
        embedded_src = self.dropout(self.embedding(src))
        encoder_outputs, (hidden, cell) = self.encoder(embedded_src)

        # One context vector per batch item, computed from the final
        # encoder hidden state only (not per decoder step).
        attn_weights = self.attn(hidden[-1], encoder_outputs)
        context = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs).squeeze(1)

        # Decoder, initialised with the encoder's final state
        embedded_trg = self.dropout(self.embedding(trg))
        outputs, _ = self.decoder(embedded_trg, (hidden, cell))

        # Simple fusion: the same context is broadcast-added to every
        # decoder time step.
        outputs = outputs + context.unsqueeze(1)

        return self.fc(outputs)

    def generate(self, question, vocab, max_len):
        """Greedily decode an answer for ``question``.

        Puts the module in eval mode (it is not switched back afterwards),
        pads the tokenized question to ``max_len`` and repeatedly feeds the
        tokens produced so far back through ``forward``, taking the argmax
        at the last position, until ``<EOS>`` is produced or ``max_len``
        tokens exist.

        Args:
            question: Question string.
            vocab: Word-to-index mapping containing the special tokens.
            max_len: Padded source length and maximum number of decoded
                positions (including the leading ``<SOS>``).

        Returns:
            The decoded words joined by single spaces, with ``<PAD>``,
            ``<SOS>`` and ``<EOS>`` removed.
        """
        self.eval()
        tokenized_question = tokenize(question, vocab)
        padded_question = pad_sequences([tokenized_question], max_len)
        src = torch.tensor(padded_question, dtype=torch.long)

        trg = torch.zeros((1, max_len), dtype=torch.long)
        trg[0, 0] = vocab["<SOS>"]
        eos_id = vocab["<EOS>"]

        with torch.no_grad():
            for i in range(1, max_len):
                output = self.forward(src, trg[:, :i])
                next_token = output.argmax(2)[:, -1].item()
                trg[0, i] = next_token
                if next_token == eos_id:
                    break

        # Build the reverse map once instead of scanning the vocabulary for
        # every token. setdefault keeps the first word seen for an index,
        # matching what list.index() returned.
        index_to_word = {}
        for word, index in vocab.items():
            index_to_word.setdefault(index, word)

        skipped = {vocab["<PAD>"], vocab["<SOS>"], eos_id}
        return " ".join(
            index_to_word[token]
            for token in trg[0].tolist()
            if token not in skipped
        )
135
+
136
+
137
def train_model(file):
    """Train a Seq2Seq model on a JSON file of question/answer pairs.

    The file must contain a list of objects with ``"question"`` and
    ``"answer"`` keys. The data is split 75/25 into train and test sets,
    a vocabulary is built from the training texts (``build_vocab`` also
    writes it to disk), and the model is trained full-batch with teacher
    forcing. Exact-match accuracy on the test split is printed at the end.

    Args:
        file: Path to the JSON dataset.

    Returns:
        Tuple ``(model, vocab, max_len, vocab_size, embedding_dim,
        hidden_dim)``.
    """
    # Explicit encoding: the dataset text may contain non-ASCII characters
    # that a platform-default codec could mis-decode.
    with open(file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Extract questions and answers
    questions = [item["question"] for item in data]
    answers = [item["answer"] for item in data]

    # Split data into train and test sets
    train_questions, test_questions, train_answers, test_answers = train_test_split(
        questions, answers, test_size=0.25, random_state=42
    )

    # Build vocabulary and tokenize data
    vocab = build_vocab(train_questions + train_answers)
    tokenized_train_questions = [tokenize(q, vocab) for q in train_questions]
    tokenized_train_answers = [tokenize(a, vocab) for a in train_answers]
    # The test split is tokenized only so it contributes to max_len;
    # evaluation below works on the raw strings.
    tokenized_test_questions = [tokenize(q, vocab) for q in test_questions]
    tokenized_test_answers = [tokenize(a, vocab) for a in test_answers]

    # Longest sequence across both splits
    max_len = max(
        max(len(seq) for seq in tokenized_train_questions + tokenized_train_answers),
        max(len(seq) for seq in tokenized_test_questions + tokenized_test_answers),
    )

    print(f"Using max_len: {max_len}")

    # Pad and convert the training data to tensors
    train_src = torch.tensor(
        pad_sequences(tokenized_train_questions, max_len), dtype=torch.long
    )
    train_trg = torch.tensor(
        pad_sequences(tokenized_train_answers, max_len), dtype=torch.long
    )

    # Hyperparameters
    vocab_size = len(vocab)
    embedding_dim = 64
    hidden_dim = 128
    model = Seq2Seq(vocab_size, embedding_dim, hidden_dim)

    # Loss and optimizer; index 0 is <PAD> and is excluded from the loss
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    # Training loop with teacher forcing (one full-batch step per epoch)
    epochs = 800
    model.train()  # make the dropout-active mode explicit
    for epoch in range(epochs):
        optimizer.zero_grad()
        # Decoder input drops the last token; the target drops the first,
        # so position t is trained to predict token t + 1.
        output = model(train_src, train_trg[:, :-1])
        # CrossEntropyLoss expects the class dimension second.
        loss = criterion(output.transpose(1, 2), train_trg[:, 1:])
        loss.backward()
        optimizer.step()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")

    accuracy = evaluate_model(model, test_questions, test_answers, vocab, max_len)
    print(f"Test Accuracy: {accuracy * 100:.2f}%")
    return model, vocab, max_len, vocab_size, embedding_dim, hidden_dim
201
+
202
+
203
def generate_answer(model, question, vocab, max_len=34):
    """Greedily decode an answer to ``question`` with ``model``.

    Puts ``model`` in eval mode, pads the tokenized question to
    ``max_len`` and repeatedly calls ``model(src, trg_so_far)``, taking
    the argmax at the last position, until ``<EOS>`` is produced or
    ``max_len`` tokens exist.

    Args:
        model: Callable module taking ``(src, trg)`` id tensors and
            returning scores of shape ``(batch, trg_len, vocab_size)``.
        question: Question string.
        vocab: Word-to-index mapping containing the special tokens.
        max_len: Padded source length and maximum number of decoded
            positions (including the leading ``<SOS>``).

    Returns:
        The decoded words joined by single spaces, with ``<PAD>``,
        ``<SOS>`` and ``<EOS>`` removed.
    """
    model.eval()
    tokenized_question = tokenize(question, vocab)
    padded_question = pad_sequences([tokenized_question], max_len)
    src = torch.tensor(padded_question, dtype=torch.long)

    # Initialize decoder input with <SOS> token
    trg = torch.zeros((1, max_len), dtype=torch.long)
    trg[0, 0] = vocab["<SOS>"]
    eos_id = vocab["<EOS>"]

    with torch.no_grad():
        for i in range(1, max_len):
            output = model(src, trg[:, :i])
            next_token = output.argmax(2)[:, -1].item()
            trg[0, i] = next_token
            if next_token == eos_id:
                break

    # Build the reverse map once instead of scanning the vocabulary for
    # every token. setdefault keeps the first word seen for an index,
    # matching what list.index() returned.
    index_to_word = {}
    for word, index in vocab.items():
        index_to_word.setdefault(index, word)

    # Convert tokens to words, dropping the special markers
    skipped = {vocab["<PAD>"], vocab["<SOS>"], eos_id}
    return " ".join(
        index_to_word[token]
        for token in trg[0].tolist()
        if token not in skipped
    )
seq2seq_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5028eca7f654efeac7e8ef5f9d859e4c585c6aadf16d6fb6aabca082f9e0213e
3
+ size 1051288
temp.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json

import torch

from main_model import Seq2Seq, generate_answer

# Load the model hyperparameters saved next to the weights.
with open("./config.json", "r", encoding="utf-8") as f:
    config = json.load(f)

vocab_size = config["vocab_size"]
embedding_dim = config["embedding_dim"]
hidden_dim = config["hidden_dim"]
max_len = config["max_len"]

# Initialize Model
model = Seq2Seq(vocab_size, embedding_dim, hidden_dim)
# map_location keeps loading working on CPU-only machines even if the
# checkpoint was saved from another device; weights_only avoids unpickling
# arbitrary objects.
model.load_state_dict(
    torch.load("./seq2seq_model.pth", map_location="cpu", weights_only=True)
)
model.eval()  # Set model to evaluation mode

with open("./ma_vocab.json", "r", encoding="utf-8") as f:
    vocab = json.load(f)

# Create mappings
word2idx = vocab
idx2word = {idx: word for word, idx in vocab.items()}


question = "what is MA?"
# Pass the configured max_len explicitly; otherwise generate_answer falls
# back to its own default and pads the question to a different length.
# NOTE(review): presumably config["max_len"] is the length used in
# training -- confirm.
answer = generate_answer(model, question, vocab=word2idx, max_len=max_len)
print("Answer:", answer)