Ontocord.AI
committed on
Commit
·
9f3edac
1
Parent(s):
7d1de3e
Create README.md
Browse files
README.md
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
---
|
| 4 |
+
|
| 5 |
+
```python
|
| 6 |
+
# test merged experts
|
| 7 |
+
# TODO: add dynamic routing, testing better expert mixtures
|
| 8 |
+
|
| 9 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 10 |
+
|
| 11 |
+
import torch
|
| 12 |
+
|
| 13 |
+
from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXForCausalLM, GPTNeoXLayer
|
| 14 |
+
from torch import nn
|
| 15 |
+
class GPTNeoXExpertsForCasualLM(GPTNeoXForCausalLM):
    """GPT-NeoX causal LM that holds swappable domain-expert copies of layers 9-10.

    Each expert is a 2-element ``nn.ModuleList`` of ``GPTNeoXLayer`` that can be
    hot-swapped into ``self.gpt_neox.layers[9:11]`` before generation.  The class
    name keeps the original "CasualLM" (sic) spelling so that existing
    checkpoints saved under this name still load.
    """

    # Layers [EXPERT_LAYER_START, EXPERT_LAYER_START + NUM_EXPERT_LAYERS) of the
    # backbone are the ones replaced per expert.
    EXPERT_LAYER_START = 9
    NUM_EXPERT_LAYERS = 2
    # Attribute names that hold a valid expert ModuleList.
    EXPERT_NAMES = (
        "orig_chat",
        "uspto_expert",
        "github_expert",
        "pubmed_abstracts_expert",
        "freelaw_expert",
        "arxiv_expert",
        "merged_chat_expert",
    )

    def __init__(self, config):
        super().__init__(config)
        self.config = config
        n = self.NUM_EXPERT_LAYERS
        # One ModuleList of freshly initialized layers per expert; real weights
        # are filled in later (e.g. by recreate_merged_expert) or restored by
        # from_pretrained.
        self.orig_chat = nn.ModuleList([GPTNeoXLayer(config) for _ in range(n)])
        self.uspto_expert = nn.ModuleList([GPTNeoXLayer(config) for _ in range(n)])
        self.github_expert = nn.ModuleList([GPTNeoXLayer(config) for _ in range(n)])
        self.pubmed_abstracts_expert = nn.ModuleList([GPTNeoXLayer(config) for _ in range(n)])
        self.freelaw_expert = nn.ModuleList([GPTNeoXLayer(config) for _ in range(n)])
        self.arxiv_expert = nn.ModuleList([GPTNeoXLayer(config) for _ in range(n)])
        self.merged_chat_expert = nn.ModuleList([GPTNeoXLayer(config) for _ in range(n)])
        # Name of the expert whose layers currently sit in the backbone.
        self.curr_expert = "merged_chat_expert"

    def generate_with_expert(self, text, tokenizer, expert="merged_chat_expert",
                             return_answer_only=False, do_self_contrastive=True,
                             max_length=512, min_length=1, max_return_sequences=1,
                             do_sample=True, do_beam=False, device="cuda",
                             target_lang=None):
        """Swap ``expert`` into layers 9-10, then generate from ``text``.

        Args:
            text: a prompt string or list of prompt strings.
            tokenizer: the tokenizer matching this model; its pad token is set
                to EOS so batched padding works.
            expert: one of ``EXPERT_NAMES``.
            return_answer_only: strip the echoed prompt from each output.
            do_self_contrastive: if True, pass ``penalty_alpha=0.6`` to generate.
            max_length / min_length: generation length bounds (tokens).
            max_return_sequences: sequences generated per prompt.
            do_sample: enable sampling (top_p/top_k below apply when sampling).
            do_beam, target_lang: accepted for interface compatibility but
                currently unused.
            device: device the input tensors are moved to.

        Returns:
            list[str] of decoded generations, grouped per input prompt.

        Raises:
            ValueError: if ``expert`` is not a known expert name.
        """
        tokenizer.pad_token = tokenizer.eos_token
        if expert != self.curr_expert:
            # Fail loudly on unknown names instead of silently generating with
            # whatever expert happens to be loaded (the original if/elif chain
            # fell through without swapping anything).
            if expert not in self.EXPERT_NAMES:
                raise ValueError(
                    f"unknown expert {expert!r}; expected one of {self.EXPERT_NAMES}")
            self.curr_expert = expert
            expert_layers = getattr(self, expert)
            for i in range(self.NUM_EXPERT_LAYERS):
                self.gpt_neox.layers[self.EXPERT_LAYER_START + i] = expert_layers[i]

        if type(text) is str:
            text = [text]
        text = [p.strip() for p in text]
        enc = tokenizer(text, return_tensors='pt', padding=True, truncation=True,
                        max_length=max_length)
        enc = enc.to(device)
        with torch.no_grad():
            outputs = self.generate(
                **enc,
                max_length=max_length,
                repetition_penalty=1.1,
                min_length=min_length,
                do_sample=do_sample,  # was hard-coded True, ignoring the argument
                num_return_sequences=max_return_sequences,  # was silently ignored
                top_p=0.95,
                penalty_alpha=0.6 if do_self_contrastive else None,
                top_k=10,
            )
        ret = []
        # Not batch_decode: we may post-process each sequence individually.
        for i in range(len(outputs)):
            out = tokenizer.decode(outputs[i], skip_special_tokens=True)
            if return_answer_only:
                # Outputs are grouped per prompt when num_return_sequences > 1.
                prompt = text[i // max(1, max_return_sequences)]
                out = out[len(prompt):].lstrip(".? \n\t")
            ret.append(out)
        return ret
|
| 74 |
+
|
| 75 |
+
# Demo: load the merged-experts checkpoint and sample two prompts per expert.
tokenizer = AutoTokenizer.from_pretrained("theblackcat102/pythia-1b-deduped-sft")
tokenizer.pad_token = tokenizer.eos_token

model1 = GPTNeoXExpertsForCasualLM.from_pretrained("Multi-Domain-Expert-Layers/MDEL-theblackcat-chat-5-experts")
model1 = model1.half().cuda().eval()

all_experts = (
    "orig_chat",
    "merged_chat_expert",
    "uspto_expert",
    "github_expert",
    "pubmed_abstracts_expert",
    "arxiv_expert",
    "freelaw_expert",
)
for expert in all_experts:
    prompts = (
        (f'## {expert}', "<human> Write a patent about an electric toothbrush\n<bot>"),
        (f'## {expert} more', "Field of the Invention.\nAn electric toothbrush\n"),
    )
    for heading, prompt in prompts:
        print(heading)
        print(model1.generate_with_expert(prompt, tokenizer, expert=expert)[0])
|
| 87 |
+
```
|
| 88 |
+
|
| 89 |
+
```python
|
| 90 |
+
|
| 91 |
+
def recreate_merged_expert():
    """Rebuild the multi-expert checkpoint from its component models.

    Loads the base chat model plus one fine-tuned model per domain, blends each
    domain model's layers 9-10 with the base (60% domain / 40% base), keeps an
    untouched copy of the chat layers as ``orig_chat``, builds a 3-way merged
    chat expert (60% chat / 30% 4-domain merge / 10% uspto), then saves and
    pushes the assembled model.

    Returns:
        The assembled ``GPTNeoXExpertsForCasualLM`` in fp16 eval mode.
    """
    model1 = GPTNeoXExpertsForCasualLM.from_pretrained("theblackcat102/pythia-1b-deduped-sft").float()
    # "feelaw" (sic) is the actual hub repo id — do not "fix" the spelling.
    model2 = AutoModelForCausalLM.from_pretrained("stillerman/MDEL-pubmed-feelaw-github-arxiv").float()
    model_uspto = AutoModelForCausalLM.from_pretrained("Multi-Domain-Expert-Layers/expert-uspto").float()
    model_github = AutoModelForCausalLM.from_pretrained("Multi-Domain-Expert-Layers/expert-github").float()
    model_pubmed_abstracts = AutoModelForCausalLM.from_pretrained("Multi-Domain-Expert-Layers/expert-pubmed_abstracts").float()
    model_freelaw = AutoModelForCausalLM.from_pretrained("Multi-Domain-Expert-Layers/expert-freelaw").float()
    model_arxiv = AutoModelForCausalLM.from_pretrained("Multi-Domain-Expert-Layers/expert-arxiv").float()

    # Clean base chat model (kept unmutated as the blend reference).
    model = AutoModelForCausalLM.from_pretrained("theblackcat102/pythia-1b-deduped-sft").float()

    expert_layer_ids = [9, 10]  # the backbone layers that carry the expert deltas

    def _blend_into(expert_list, donor, base, w_donor=0.6, w_base=0.4):
        # Blend base into donor's expert layers IN PLACE, then store the (shared)
        # layer modules in the expert ModuleList.
        for layer_id in expert_layer_ids:
            for p_d, p_b in zip(donor.gpt_neox.layers[layer_id].parameters(),
                                base.gpt_neox.layers[layer_id].parameters()):
                p_d.data = p_d.data * w_donor + p_b.data * w_base
            expert_list[layer_id - 9] = donor.gpt_neox.layers[layer_id]

    with torch.no_grad():
        # Keep an untouched copy of the base chat layers.
        for layer_id in expert_layer_ids:
            model1.orig_chat[layer_id - 9] = model.gpt_neox.layers[layer_id]

        # 3-way merge for the chat expert; mutates model1's own backbone layers,
        # which the merged expert then aliases (same module objects).
        for layer_id in expert_layer_ids:
            for p1, p2, p3 in zip(model1.gpt_neox.layers[layer_id].parameters(),
                                  model2.gpt_neox.layers[layer_id].parameters(),
                                  model_uspto.gpt_neox.layers[layer_id].parameters()):
                p1.data = p1.data * .6 + p2.data * 0.3 + p3.data * 0.1
            model1.merged_chat_expert[layer_id - 9] = model1.gpt_neox.layers[layer_id]

        # 60/40 blends of each domain model with the base chat model.
        # NOTE: model_uspto is blended AFTER the 3-way merge above consumed its
        # original weights — order matters and matches the original script.
        _blend_into(model1.uspto_expert, model_uspto, model)
        _blend_into(model1.github_expert, model_github, model)
        _blend_into(model1.pubmed_abstracts_expert, model_pubmed_abstracts, model)
        _blend_into(model1.freelaw_expert, model_freelaw, model)
        _blend_into(model1.arxiv_expert, model_arxiv, model)

    model1 = model1.half().eval()
    model1.save_pretrained("MDEL-theblackcat-chat-5-experts", torch_dtype=torch.float16)
    model1.push_to_hub("Multi-Domain-Expert-Layers/MDEL-theblackcat-chat-5-experts")
    return model1
|
| 150 |
+
```
|