Upload 5 files

Browse files

Files changed (5) hide show

config.json +6 -0
ma_vocab.json +157 -0
main_model.py +230 -0
seq2seq_model.pth +3 -0
temp.py +29 -0

config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+    "vocab_size": 155,
+    "embedding_dim": 64,
+    "hidden_dim": 128,
+    "max_len": 32
+}

ma_vocab.json ADDED Viewed

	@@ -0,0 +1,157 @@

+{
+    "do": 4,
+    "medicaid": 5,
+    "advantage": 6,
+    "plans": 7,
+    "cover": 8,
+    "prescription": 9,
+    "drugs?": 10,
+    "who": 11,
+    "is": 12,
+    "eligible": 13,
+    "for": 14,
+    "a": 15,
+    "plan?": 16,
+    "can": 17,
+    "i": 18,
+    "keep": 19,
+    "my": 20,
+    "doctor": 21,
+    "with": 22,
+    "what": 23,
+    "the": 24,
+    "difference": 25,
+    "between": 26,
+    "and": 27,
+    "medicare?": 28,
+    "benefits": 29,
+    "provide?": 30,
+    "how": 31,
+    "enroll": 32,
+    "in": 33,
+    "are": 34,
+    "there": 35,
+    "any": 36,
+    "costs": 37,
+    "associated": 38,
+    "plans?": 39,
+    "switch": 40,
+    "from": 41,
+    "original": 42,
+    "medicare": 43,
+    "to": 44,
+    "does": 45,
+    "dental": 46,
+    "vision?": 47,
+    "yes,": 48,
+    "most": 49,
+    "include": 50,
+    "part": 51,
+    "d": 52,
+    "coverage,": 53,
+    "which": 54,
+    "helps": 55,
+    "pay": 56,
+    "drugs.": 57,
+    "individuals": 58,
+    "both": 59,
+    "b": 60,
+    "qualify": 61,
+    "their": 62,
+    "state": 63,
+    "plan.": 64,
+    "it": 65,
+    "depends": 66,
+    "on": 67,
+    "plan's": 68,
+    "provider": 69,
+    "network.": 70,
+    "some": 71,
+    "have": 72,
+    "preferred": 73,
+    "network": 74,
+    "of": 75,
+    "doctors,": 76,
+    "while": 77,
+    "others": 78,
+    "allow": 79,
+    "you": 80,
+    "see": 81,
+    "accepts": 82,
+    "medicaid.": 83,
+    "federal": 84,
+    "program": 85,
+    "people": 86,
+    "aged": 87,
+    "65+": 88,
+    "or": 89,
+    "certain": 90,
+    "disabilities,": 91,
+    "state-run": 92,
+    "that": 93,
+    "low-income": 94,
+    "healthcare": 95,
+    "costs.": 96,
+    "typically": 97,
+    "hospital": 98,
+    "medical": 99,
+    "drugs,": 100,
+    "dental,": 101,
+    "vision,": 102,
+    "hearing,": 103,
+    "transportation,": 104,
+    "sometimes": 105,
+    "additional": 106,
+    "like": 107,
+    "fitness": 108,
+    "programs.": 109,
+    "during": 110,
+    "medicare\u2019s": 111,
+    "annual": 112,
+    "enrollment": 113,
+    "period": 114,
+    "(aep)": 115,
+    "special": 116,
+    "(sep)": 117,
+    "if": 118,
+    "qualify.": 119,
+    "apply": 120,
+    "online,": 121,
+    "by": 122,
+    "phone,": 123,
+    "through": 124,
+    "an": 125,
+    "insurance": 126,
+    "provider.": 127,
+    "low": 128,
+    "no": 129,
+    "premiums.": 130,
+    "however,": 131,
+    "may": 132,
+    "vary": 133,
+    "depending": 134,
+    "plan,": 135,
+    "including": 136,
+    "copayments,": 137,
+    "deductibles,": 138,
+    "out-of-pocket": 139,
+    "expenses.": 140,
+    "also": 141,
+    "known": 142,
+    "as": 143,
+    "dual-eligible": 144,
+    "needs": 145,
+    "plan": 146,
+    "(d-snp),": 147,
+    "type": 148,
+    "designed": 149,
+    "medicare,": 150,
+    "period.": 151,
+    "many": 152,
+    "hearing": 153,
+    "coverage.": 154,
+    "<PAD>": 0,
+    "<UNK>": 1,
+    "<SOS>": 2,
+    "<EOS>": 3
+}

main_model.py ADDED Viewed

	@@ -0,0 +1,230 @@

+import numpy as np
+from collections import Counter
+import torch
+import torch.nn as nn
+import json
+from sklearn.model_selection import train_test_split
+def build_vocab(texts):
+    vocab = Counter()
+    for text in texts:
+        vocab.update(text.lower().split())
+    vocab = {
+        word: idx + 4 for idx, word in enumerate(vocab)
+    }  # +4 to reserve 0 for padding, 1 for unknown, 2 for <SOS>, 3 for <EOS>
+    vocab["<PAD>"] = 0
+    vocab["<UNK>"] = 1
+    vocab["<SOS>"] = 2
+    vocab["<EOS>"] = 3
+    with open("./model/ma_vocab.json", "w") as f:
+        json.dump(vocab, f, indent=4)
+    return vocab
+# Tokenize function
+def tokenize(text, vocab):
+    return (
+        [vocab["<SOS>"]]
+        + [vocab.get(word.lower(), vocab["<UNK>"]) for word in text.split()]
+        + [vocab["<EOS>"]]
+    )
+# Pad sequences
+def pad_sequences(sequences, max_len):
+    padded = np.zeros((len(sequences), max_len))
+    for i, seq in enumerate(sequences):
+        padded[i, : len(seq)] = seq
+    return padded
+def evaluate_model(model, test_questions, test_answers, vocab, max_len):
+    correct = 0
+    for i in range(len(test_questions)):
+        question = test_questions[i]
+        true_answer = test_answers[i]
+        generated_answer = Seq2Seq.generate(model, question, vocab, max_len)
+        print(f"Question: {question}")
+        print(f"True Answer: {true_answer}")
+        print(f"Generated Answer: {generated_answer}")
+        if generated_answer.lower() == true_answer.lower():
+            correct += 1
+    accuracy = correct / len(test_questions)
+    return accuracy
+# Define Attention Layer
+class Attention(nn.Module):
+    def __init__(self, hidden_dim):
+        super(Attention, self).__init__()
+        self.attn = nn.Linear(hidden_dim * 2, hidden_dim)  # Attention layer
+        self.v = nn.Parameter(torch.rand(hidden_dim))  # Weight for attention
+    def forward(self, hidden, encoder_outputs):
+        seq_len = encoder_outputs.size(1)
+        hidden = hidden.unsqueeze(1).repeat(
+            1, seq_len, 1
+        )  # Repeat hidden state to match encoder output sequence length
+        energy = torch.tanh(
+            self.attn(torch.cat((hidden, encoder_outputs), dim=2))
+        )  # Apply attention mechanism
+        attention = torch.sum(self.v * energy, dim=2)  # Sum across hidden dim
+        return torch.softmax(attention, dim=1)
+# Define the Seq2Seq Model with Attention
+class Seq2Seq(nn.Module):
+    def __init__(self, vocab_size, embedding_dim, hidden_dim):
+        super(Seq2Seq, self).__init__()
+        self.embedding = nn.Embedding(vocab_size, embedding_dim)
+        self.encoder = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
+        self.decoder = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
+        self.attn = Attention(hidden_dim)  # Attention mechanism
+        self.fc = nn.Linear(hidden_dim, vocab_size)
+        self.dropout = nn.Dropout(0.5)  # Add dropout
+    def forward(self, src, trg):
+        # Encoder
+        embedded_src = self.dropout(self.embedding(src))
+        encoder_outputs, (hidden, cell) = self.encoder(embedded_src)
+        # Attention (if you're using it)
+        attn_weights = self.attn(hidden[-1], encoder_outputs)
+        context = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs).squeeze(1)
+        # Decoder
+        embedded_trg = self.dropout(self.embedding(trg))
+        outputs, _ = self.decoder(embedded_trg, (hidden, cell))
+        # Combine context and decoder outputs
+        outputs = outputs + context.unsqueeze(
+            1
+        )  # Add context to decoder outputs (simple fusion)
+        # Output layer
+        predictions = self.fc(outputs)
+        return predictions
+    def generate(self, question, vocab, max_len):
+        self.eval()
+        tokenized_question = tokenize(question, vocab)
+        padded_question = pad_sequences([tokenized_question], max_len)
+        src = torch.tensor(padded_question, dtype=torch.long)
+        trg = torch.zeros((1, max_len), dtype=torch.long)
+        trg[0, 0] = vocab["<SOS>"]
+        with torch.no_grad():
+            for i in range(1, max_len):
+                output = self.forward(src, trg[:, :i])
+                next_token = output.argmax(2)[:, -1]
+                trg[0, i] = next_token.item()
+                if next_token.item() == vocab["<EOS>"]:
+                    break
+        answer_tokens = trg[0].tolist()
+        answer = " ".join(
+            [
+                list(vocab.keys())[list(vocab.values()).index(token)]
+                for token in answer_tokens
+                if token not in [vocab["<PAD>"], vocab["<SOS>"], vocab["<EOS>"]]
+            ]
+        )
+        return answer
+def train_model(file):
+    with open(file, "r") as f:
+        data = json.load(f)
+    # Extract questions and answers
+    questions = [item["question"] for item in data]
+    answers = [item["answer"] for item in data]
+    # Split data into train and test sets
+    train_questions, test_questions, train_answers, test_answers = train_test_split(
+        questions, answers, test_size=0.25, random_state=42
+    )
+    # Build vocabulary and tokenize data
+    vocab = build_vocab(train_questions + train_answers)
+    tokenized_train_questions = [tokenize(q, vocab) for q in train_questions]
+    tokenized_train_answers = [tokenize(a, vocab) for a in train_answers]
+    tokenized_test_questions = [tokenize(q, vocab) for q in test_questions]
+    tokenized_test_answers = [tokenize(a, vocab) for a in test_answers]
+    # Find the maximum sequence length
+    max_len = max(
+        max(len(seq) for seq in tokenized_train_questions + tokenized_train_answers),
+        max(len(seq) for seq in tokenized_test_questions + tokenized_test_answers),
+    )
+    print(f"Using max_len: {max_len}")
+    # Pad sequences
+    padded_train_questions = pad_sequences(tokenized_train_questions, max_len)
+    padded_train_answers = pad_sequences(tokenized_train_answers, max_len)
+    padded_test_questions = pad_sequences(tokenized_test_questions, max_len)
+    padded_test_answers = pad_sequences(tokenized_test_answers, max_len)
+    # Convert data to PyTorch tensors
+    train_src = torch.tensor(padded_train_questions, dtype=torch.long)
+    train_trg = torch.tensor(padded_train_answers, dtype=torch.long)
+    test_src = torch.tensor(padded_test_questions, dtype=torch.long)
+    test_trg = torch.tensor(padded_test_answers, dtype=torch.long)
+    # Hyperparameters
+    vocab_size = len(vocab)
+    embedding_dim = 64
+    hidden_dim = 128
+    model = Seq2Seq(vocab_size, embedding_dim, hidden_dim)
+    # Loss and optimizer
+    criterion = nn.CrossEntropyLoss(ignore_index=0)  # Ignore padding tokens
+    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
+    # Training loop with teacher forcing
+    epochs = 800
+    for epoch in range(epochs):
+        optimizer.zero_grad()
+        output = model(train_src, train_trg[:, :-1])  # Exclude last token from target
+        loss = criterion(
+            output.transpose(1, 2), train_trg[:, 1:]
+        )  # Exclude first token from target
+        loss.backward()
+        optimizer.step()
+        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")
+    accuracy = evaluate_model(model, test_questions, test_answers, vocab, max_len)
+    print(f"Test Accuracy: {accuracy * 100:.2f}%")
+    return model, vocab, max_len, vocab_size, embedding_dim, hidden_dim
+def generate_answer(model, question, vocab, max_len=34):
+    model.eval()
+    tokenized_question = tokenize(question, vocab)
+    padded_question = pad_sequences([tokenized_question], max_len)
+    src = torch.tensor(padded_question, dtype=torch.long)
+    # Initialize decoder input with <SOS> token
+    trg = torch.zeros((1, max_len), dtype=torch.long)
+    trg[0, 0] = vocab["<SOS>"]
+    with torch.no_grad():
+        for i in range(1, max_len):
+            output = model(src, trg[:, :i])
+            next_token = output.argmax(2)[:, -1]
+            trg[0, i] = next_token.item()
+            if next_token.item() == vocab["<EOS>"]:
+                break
+    # Convert tokens to words
+    answer_tokens = trg[0].tolist()
+    answer = " ".join(
+        [
+            list(vocab.keys())[list(vocab.values()).index(token)]
+            for token in answer_tokens
+            if token not in [vocab["<PAD>"], vocab["<SOS>"], vocab["<EOS>"]]
+        ]
+    )
+    return answer

seq2seq_model.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5028eca7f654efeac7e8ef5f9d859e4c585c6aadf16d6fb6aabca082f9e0213e
+size 1051288

temp.py ADDED Viewed

	@@ -0,0 +1,29 @@

+import json
+import torch
+from main_model import Seq2Seq , generate_answer
+with open("./config.json", "r") as f:
+    config = json.load(f)
+vocab_size = config["vocab_size"]
+embedding_dim = config["embedding_dim"]
+hidden_dim = config["hidden_dim"]
+max_len = config["max_len"]
+# Initialize Model
+model = Seq2Seq(vocab_size, embedding_dim, hidden_dim)
+model.load_state_dict(torch.load("./seq2seq_model.pth",weights_only=True))
+model.eval()  # Set model to evaluation mode
+with open("./ma_vocab.json", "r") as f:
+    vocab = json.load(f)
+# Create mappings
+word2idx = vocab
+idx2word = {idx: word for word, idx in vocab.items()}
+question = "what is MA?"
+answer = generate_answer(model, question, vocab=word2idx)
+print("Answer:", answer)