File size: 5,373 Bytes
51be264 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 |
import traceback
from queue import Queue
from threading import Thread
import fire
import gradio as gr
import torch
import moe_peft
class Iteratorize:
    """
    Transforms a function that takes a callback
    into a lazy iterator (generator).

    ``func`` is started on a background thread as soon as the object is
    constructed and is invoked as ``func(callback=..., **kwargs)``. Every time
    ``func`` calls that callback with ``(seq_pos, output)``, the value
    ``output["default"][0]`` is queued and later handed out by ``next()``.
    Iteration ends once ``func`` has returned or raised.

    The object is also a context manager: leaving the ``with`` block sets
    ``stop_now``, which makes the next callback invocation raise
    ``ValueError`` inside ``func`` in order to abort it.

    Args:
        func: Callable accepting a ``callback`` keyword argument.
        kwargs: Extra keyword arguments forwarded to ``func``. ``None``
            (the default) means no extra arguments.
        callback: Optional completion hook. It is called on the worker
            thread with the return value of ``func``, or with ``None`` when
            ``func`` did not finish normally.
    """

    def __init__(self, func, kwargs=None, callback=None):
        self.mfunc = func
        self.c_callback = callback
        self.q = Queue()
        self.sentinel = object()
        # A fresh dict per instance instead of a shared mutable default.
        self.kwargs = kwargs if kwargs is not None else {}
        self.stop_now = False

        def _callback(seq_pos, output):
            # Raising from inside the callback is how the producer is
            # interrupted once the consumer has left the ``with`` block.
            if self.stop_now:
                raise ValueError
            self.q.put(output["default"][0])

        def gentask():
            # Pre-bind so the completion hook still receives a value when
            # ``func`` raises before returning.
            ret = None
            try:
                ret = self.mfunc(callback=_callback, **self.kwargs)
            except ValueError:
                # Expected only as the stop signal raised by ``_callback``;
                # any other ValueError is a real failure and is reported.
                if not self.stop_now:
                    traceback.print_exc()
            except Exception:
                traceback.print_exc()
            finally:
                # Always wake the consumer, whatever happened above.
                self.q.put(self.sentinel)
            if self.c_callback:
                self.c_callback(ret)

        self.thread = Thread(target=gentask)
        self.thread.start()

    def __iter__(self):
        return self

    def __next__(self):
        obj = self.q.get(True, None)
        if obj is self.sentinel:
            # Put the sentinel back so that further next() calls keep raising
            # StopIteration instead of blocking forever on an empty queue.
            self.q.put(self.sentinel)
            raise StopIteration
        return obj

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Only requests the stop; the worker notices it on its next callback.
        self.stop_now = True
# Instruction substituted when the user submits an empty "Instruction" field;
# also displayed as that textbox's placeholder in the UI.
placeholder_text = "Could you provide an introduction to MoE-PEFT?"
def main(
    base_model: str,
    template: str = None,
    lora_weights: str = "",
    load_16bit: bool = True,
    load_8bit: bool = False,
    load_4bit: bool = False,
    flash_attn: bool = False,
    device: str = moe_peft.executor.default_device_name(),
    server_name: str = "0.0.0.0",
    share_gradio: bool = False,
):
    """Load a model with MoE-PEFT and serve a Gradio text-generation UI.

    Args:
        base_model: Model identifier passed to ``LLMModel.from_pretrained``
            and to ``Tokenizer``.
        template: Prompt template forwarded to ``GenerateConfig``.
        lora_weights: Adapter to load under the name ``"default"``. When
            empty, a fresh ``"default"`` adapter is initialised instead.
        load_16bit: Load weights as ``torch.bfloat16`` when true, otherwise
            as ``torch.float32``.
        load_8bit: Request 8-bit loading. Takes precedence over ``load_4bit``.
        load_4bit: Request 4-bit loading (only consulted when ``load_8bit``
            is false).
        flash_attn: Select the ``"flash_attn"`` attention implementation
            instead of ``"eager"``.
        device: Device name handed to ``LLMModel.from_pretrained``.
        server_name: Host name/address passed to Gradio's ``launch``.
        share_gradio: Passed to Gradio's ``launch`` as ``share``.
    """
    if load_8bit:
        quant_bits = 8
    elif load_4bit:
        quant_bits = 4
    else:
        quant_bits = None

    model = moe_peft.LLMModel.from_pretrained(
        base_model,
        device=device,
        attn_impl="flash_attn" if flash_attn else "eager",
        bits=quant_bits,
        load_dtype=torch.bfloat16 if load_16bit else torch.float32,
    )
    tokenizer = moe_peft.Tokenizer(base_model)

    if lora_weights:
        model.load_adapter(lora_weights, "default")
    else:
        model.init_adapter(moe_peft.AdapterConfig(adapter_name="default"))

    # One config object is reused and mutated on every request.
    # NOTE(review): safe only while requests are processed one at a time.
    generation_config = moe_peft.GenerateConfig(
        adapter_name="default",
        prompt_template=template,
    )

    def evaluate(
        instruction,
        input="",
        temperature=0.1,
        top_p=0.75,
        top_k=40,
        repetition_penalty=1.1,
        max_new_tokens=128,
        stream_output=False,
    ):
        """Generate a reply for one request; yields the output text.

        With ``stream_output`` every value delivered by the stream callback
        is yielded in turn; otherwise the single finished result is yielded
        once.
        """
        instruction = instruction.strip()
        if not instruction:
            instruction = placeholder_text

        # An empty auxiliary input is passed on as None rather than "".
        input = input.strip()
        if not input:
            input = None

        generation_config.prompts = [(instruction, input)]
        generation_config.temperature = temperature
        generation_config.top_p = top_p
        generation_config.top_k = top_k
        generation_config.repetition_penalty = repetition_penalty

        generate_params = {
            "model": model,
            "tokenizer": tokenizer,
            "configs": [generation_config],
            "max_gen_len": max_new_tokens,
        }

        if stream_output:
            # Stream the reply 1 token at a time.

            def generate_with_callback(callback=None, **kwargs):
                # Adapt Iteratorize's ``callback`` keyword to the
                # ``stream_callback`` keyword of moe_peft.generate.
                moe_peft.generate(stream_callback=callback, **kwargs)

            def generate_with_streaming(**kwargs):
                return Iteratorize(generate_with_callback, kwargs, callback=None)

            with generate_with_streaming(**generate_params) as generator:
                for output in generator:
                    yield output
            return  # early return for stream_output

        # Without streaming
        output = moe_peft.generate(**generate_params)
        yield output["default"][0]

    max_seq_len = model.config_.max_seq_len_

    gr.Interface(
        fn=evaluate,
        inputs=[
            gr.components.Textbox(
                lines=2,
                label="Instruction",
                placeholder=placeholder_text,
            ),
            gr.components.Textbox(lines=2, label="Input", placeholder="none"),
            gr.components.Slider(minimum=0, maximum=1, value=1, label="Temperature"),
            gr.components.Slider(
                minimum=0, maximum=1, value=0.9, label="Sampling Top-P"
            ),
            gr.components.Slider(
                minimum=0, maximum=100, step=1, value=40, label="Sampling Top-K"
            ),
            gr.components.Slider(
                minimum=0, maximum=2, value=1.1, label="Repetition Penalty"
            ),
            gr.components.Slider(
                minimum=1,
                maximum=max_seq_len,
                step=1,
                # Keep the initial value inside the slider range for models
                # whose maximum sequence length is below 1024.
                value=min(1024, max_seq_len),
                label="Max Tokens",
            ),
            gr.components.Checkbox(label="Stream Output", value=True),
        ],
        outputs=[
            gr.components.Textbox(
                lines=5,
                label="Output",
            )
        ],
        title="MoE-PEFT LLM Evaluator",
        description="Evaluate language models and LoRA weights",  # noqa: E501
    ).queue().launch(server_name=server_name, share=share_gradio)
# Script entry point: hand main() to python-fire so it can be driven from the
# command line.
if __name__ == "__main__":
    fire.Fire(main)
|