Spaces:
Runtime error
Runtime error
Commit
·
83d9f95
1
Parent(s):
8910711
Update with h2oGPT hash d2fec0293c2259c210f6d808282cb70b2466130b
Browse files
app.py
CHANGED
|
@@ -34,6 +34,7 @@ admin_pass = os.getenv("ADMIN_PASS")
|
|
| 34 |
# will sometimes appear in UI or sometimes actual generation, but maybe better than empty result
|
| 35 |
raise_generate_gpu_exceptions = True
|
| 36 |
|
|
|
|
| 37 |
|
| 38 |
def main(
|
| 39 |
load_8bit: bool = False,
|
|
@@ -144,12 +145,12 @@ def main(
|
|
| 144 |
if not gradio:
|
| 145 |
if eval_sharegpt_prompts_only > 0:
|
| 146 |
# override default examples with shareGPT ones for human-level eval purposes only
|
| 147 |
-
|
| 148 |
-
if not os.path.isfile(
|
| 149 |
os.system(
|
| 150 |
-
'wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/%s' %
|
| 151 |
import json
|
| 152 |
-
data = json.load(open(
|
| 153 |
# focus on data that starts with human, else likely chopped from other data
|
| 154 |
turn_start = 0 # odd in general
|
| 155 |
data = [x for x in data if len(x['conversations']) > turn_start + 1 and
|
|
@@ -165,12 +166,29 @@ def main(
|
|
| 165 |
assert data[i]['conversations'][turn_start + 1]['from'] == 'gpt'
|
| 166 |
output = data[i]['conversations'][turn_start + 1]['value']
|
| 167 |
examplenew = example1.copy()
|
| 168 |
-
|
| 169 |
-
examplenew[
|
| 170 |
-
examplenew[
|
|
|
|
| 171 |
examples.append(examplenew)
|
| 172 |
responses.append(output)
|
| 173 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
with torch.device("cuda"):
|
| 175 |
# ensure stream_output was already set above, before the examples were generated
|
| 176 |
assert not stream_output, "stream_output=True does not make sense with example loop"
|
|
@@ -183,7 +201,7 @@ def main(
|
|
| 183 |
if not eval_sharegpt_as_output:
|
| 184 |
model, tokenizer, device = get_model(**locals())
|
| 185 |
model_state = [model, tokenizer, device, base_model]
|
| 186 |
-
fun = partial(evaluate, model_state, debug=debug,
|
| 187 |
else:
|
| 188 |
assert eval_sharegpt_prompts_only > 0
|
| 189 |
|
|
@@ -194,15 +212,17 @@ def main(
|
|
| 194 |
fun = get_response
|
| 195 |
t0 = time.time()
|
| 196 |
score_dump = []
|
| 197 |
-
num_examples = len(examples)
|
| 198 |
|
| 199 |
import matplotlib.pyplot as plt
|
| 200 |
|
| 201 |
for exi, ex in enumerate(examples):
|
|
|
|
|
|
|
|
|
|
| 202 |
clear_torch_cache()
|
| 203 |
print("")
|
| 204 |
print("START" + "=" * 100)
|
| 205 |
-
print("Question: %s %s" % (
|
| 206 |
print("-" * 105)
|
| 207 |
# fun yields as generator, so have to iterate over it
|
| 208 |
# Also means likely do NOT want --stream_output=True, else would show all generations
|
|
@@ -211,14 +231,14 @@ def main(
|
|
| 211 |
if smodel:
|
| 212 |
score_with_prompt = False
|
| 213 |
if score_with_prompt:
|
| 214 |
-
data_point = dict(instruction=
|
| 215 |
prompter = Prompter(prompt_type, debug=debug, chat=chat, stream_output=stream_output)
|
| 216 |
prompt = prompter.generate_prompt(data_point)
|
| 217 |
else:
|
| 218 |
# just raw input and output
|
| 219 |
-
assert
|
| 220 |
-
assert
|
| 221 |
-
prompt =
|
| 222 |
cutoff_len = 768 if is_low_mem else 2048
|
| 223 |
inputs = stokenizer(prompt, res,
|
| 224 |
return_tensors="pt",
|
|
@@ -246,30 +266,16 @@ def main(
|
|
| 246 |
print("SCORE %s: %s" % (exi, score), flush=True)
|
| 247 |
score_dump.append(ex + [prompt, res, score])
|
| 248 |
# dump every score in case of abort
|
| 249 |
-
scoring_path = 'scoring'
|
| 250 |
-
os.makedirs(scoring_path, exist_ok=True)
|
| 251 |
-
if eval_sharegpt_as_output:
|
| 252 |
-
used_base_model = 'gpt35'
|
| 253 |
-
used_lora_weights = ''
|
| 254 |
-
else:
|
| 255 |
-
used_base_model = str(base_model.split('/')[-1])
|
| 256 |
-
used_lora_weights = str(lora_weights.split('/')[-1])
|
| 257 |
df_scores = pd.DataFrame(score_dump,
|
| 258 |
-
columns=eval_func_param_names +
|
| 259 |
-
|
| 260 |
-
eval_sharegpt_prompts_only_seed,
|
| 261 |
-
eval_sharegpt_as_output,
|
| 262 |
-
used_base_model,
|
| 263 |
-
used_lora_weights)
|
| 264 |
-
filename = os.path.join(scoring_path, filename)
|
| 265 |
-
df_scores.to_parquet(filename, index=False)
|
| 266 |
# plot histogram so far
|
| 267 |
plt.figure(figsize=(10, 10))
|
| 268 |
plt.hist(df_scores['score'], bins=20)
|
| 269 |
score_avg = np.mean(df_scores['score'])
|
| 270 |
score_median = np.median(df_scores['score'])
|
| 271 |
plt.title("Score avg: %s median: %s" % (score_avg, score_median))
|
| 272 |
-
plt.savefig(
|
| 273 |
plt.close()
|
| 274 |
|
| 275 |
print("END" + "=" * 102)
|
|
@@ -278,7 +284,8 @@ def main(
|
|
| 278 |
print("Time taken so far: %.4f about %.4g per example" % (t2 - t0, (t2 - t0) / (1 + exi)))
|
| 279 |
t1 = time.time()
|
| 280 |
print("Total time taken: %.4f about %.4g per example" % (t1 - t0, (t1 - t0) / num_examples))
|
| 281 |
-
return
|
|
|
|
| 282 |
if gradio:
|
| 283 |
go_gradio(**locals())
|
| 284 |
|
|
@@ -774,7 +781,7 @@ body.dark{background:linear-gradient(#0d0d0d,#333333);}"""
|
|
| 774 |
visible=not is_public and False)
|
| 775 |
do_sample = gr.Checkbox(label="Sample", info="Enable sampler, required for use of temperature, top_p, top_k",
|
| 776 |
value=kwargs['do_sample'])
|
| 777 |
-
temperature = gr.Slider(minimum=0, maximum=3,
|
| 778 |
value=kwargs['temperature'],
|
| 779 |
label="Temperature",
|
| 780 |
info="Lower is deterministic (but may lead to repeats), Higher more creative (but may lead to hallucinations)")
|
|
@@ -984,6 +991,11 @@ body.dark{background:linear-gradient(#0d0d0d,#333333);}"""
|
|
| 984 |
instruction_nochat_arg_id = eval_func_param_names.index('instruction_nochat')
|
| 985 |
question = args_list[instruction_nochat_arg_id]
|
| 986 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 987 |
question = question[-cutoff_len:]
|
| 988 |
answer = answer[-cutoff_len:]
|
| 989 |
|
|
@@ -1307,10 +1319,12 @@ body.dark{background:linear-gradient(#0d0d0d,#333333);}"""
|
|
| 1307 |
outputs=[model_state, model_used, lora_used, prompt_type])
|
| 1308 |
prompt_update_args = dict(fn=dropdown_prompt_type_list, inputs=prompt_type, outputs=prompt_type)
|
| 1309 |
chatbot_update_args = dict(fn=chatbot_list, inputs=[text_output, model_used], outputs=text_output)
|
|
|
|
| 1310 |
if not is_public:
|
| 1311 |
load_model_event = load_model_button.click(**load_model_args) \
|
| 1312 |
.then(**prompt_update_args) \
|
| 1313 |
.then(**chatbot_update_args) \
|
|
|
|
| 1314 |
.then(clear_torch_cache)
|
| 1315 |
|
| 1316 |
load_model_args2 = dict(fn=load_model,
|
|
@@ -1735,6 +1749,7 @@ def get_generate_params(model_lower, chat,
|
|
| 1735 |
if not prompt_type and model_lower in inv_prompt_type_to_model_lower:
|
| 1736 |
prompt_type = inv_prompt_type_to_model_lower[model_lower]
|
| 1737 |
|
|
|
|
| 1738 |
if show_examples is None:
|
| 1739 |
if chat:
|
| 1740 |
show_examples = False
|
|
@@ -1831,6 +1846,7 @@ Philipp: ok, ok you can find everything here. https://huggingface.co/blog/the-pa
|
|
| 1831 |
repetition_penalty = repetition_penalty or 1.07
|
| 1832 |
num_return_sequences = min(num_beams, num_return_sequences or 1)
|
| 1833 |
do_sample = False if do_sample is None else do_sample
|
|
|
|
| 1834 |
params_list = ["", stream_output, prompt_type, temperature, top_p, top_k, num_beams, max_new_tokens, min_new_tokens,
|
| 1835 |
early_stopping, max_time, repetition_penalty, num_return_sequences, do_sample]
|
| 1836 |
|
|
@@ -1874,10 +1890,11 @@ y = np.random.randint(0, 1, 100)
|
|
| 1874 |
src_lang = "English"
|
| 1875 |
tgt_lang = "Russian"
|
| 1876 |
|
| 1877 |
-
#
|
| 1878 |
-
|
| 1879 |
-
|
| 1880 |
-
|
|
|
|
| 1881 |
example[eval_func_param_names.index('instruction_nochat')] = example[
|
| 1882 |
eval_func_param_names.index('instruction')]
|
| 1883 |
example[eval_func_param_names.index('instruction')] = ''
|
|
|
|
| 34 |
# will sometimes appear in UI or sometimes actual generation, but maybe better than empty result
|
| 35 |
raise_generate_gpu_exceptions = True
|
| 36 |
|
| 37 |
+
eval_extra_columns = ['prompt', 'response', 'score']
|
| 38 |
|
| 39 |
def main(
|
| 40 |
load_8bit: bool = False,
|
|
|
|
| 145 |
if not gradio:
|
| 146 |
if eval_sharegpt_prompts_only > 0:
|
| 147 |
# override default examples with shareGPT ones for human-level eval purposes only
|
| 148 |
+
eval_filename = 'ShareGPT_V3_unfiltered_cleaned_split_no_imsorry.json'
|
| 149 |
+
if not os.path.isfile(eval_filename):
|
| 150 |
os.system(
|
| 151 |
+
'wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/%s' % eval_filename)
|
| 152 |
import json
|
| 153 |
+
data = json.load(open(eval_filename, 'rt'))
|
| 154 |
# focus on data that starts with human, else likely chopped from other data
|
| 155 |
turn_start = 0 # odd in general
|
| 156 |
data = [x for x in data if len(x['conversations']) > turn_start + 1 and
|
|
|
|
| 166 |
assert data[i]['conversations'][turn_start + 1]['from'] == 'gpt'
|
| 167 |
output = data[i]['conversations'][turn_start + 1]['value']
|
| 168 |
examplenew = example1.copy()
|
| 169 |
+
assert not chat, "No gradio must use chat=False, uses nochat isntruct"
|
| 170 |
+
examplenew[eval_func_param_names.index('instruction_nochat')] = instruction
|
| 171 |
+
examplenew[eval_func_param_names.index('iinput_nochat')] = '' # no input
|
| 172 |
+
examplenew[eval_func_param_names.index('context')] = '' # no context
|
| 173 |
examples.append(examplenew)
|
| 174 |
responses.append(output)
|
| 175 |
|
| 176 |
+
num_examples = len(examples)
|
| 177 |
+
scoring_path = 'scoring'
|
| 178 |
+
os.makedirs(scoring_path, exist_ok=True)
|
| 179 |
+
if eval_sharegpt_as_output:
|
| 180 |
+
used_base_model = 'gpt35'
|
| 181 |
+
used_lora_weights = ''
|
| 182 |
+
else:
|
| 183 |
+
used_base_model = str(base_model.split('/')[-1])
|
| 184 |
+
used_lora_weights = str(lora_weights.split('/')[-1])
|
| 185 |
+
eval_filename = "df_scores_%s_%s_%s_%s_%s_%s.parquet" % (num_examples, eval_sharegpt_prompts_only,
|
| 186 |
+
eval_sharegpt_prompts_only_seed,
|
| 187 |
+
eval_sharegpt_as_output,
|
| 188 |
+
used_base_model,
|
| 189 |
+
used_lora_weights)
|
| 190 |
+
eval_filename = os.path.join(scoring_path, eval_filename)
|
| 191 |
+
|
| 192 |
with torch.device("cuda"):
|
| 193 |
# ensure stream_output was already set above, before the examples were generated
|
| 194 |
assert not stream_output, "stream_output=True does not make sense with example loop"
|
|
|
|
| 201 |
if not eval_sharegpt_as_output:
|
| 202 |
model, tokenizer, device = get_model(**locals())
|
| 203 |
model_state = [model, tokenizer, device, base_model]
|
| 204 |
+
fun = partial(evaluate, model_state, debug=debug, save_dir=save_dir)
|
| 205 |
else:
|
| 206 |
assert eval_sharegpt_prompts_only > 0
|
| 207 |
|
|
|
|
| 212 |
fun = get_response
|
| 213 |
t0 = time.time()
|
| 214 |
score_dump = []
|
|
|
|
| 215 |
|
| 216 |
import matplotlib.pyplot as plt
|
| 217 |
|
| 218 |
for exi, ex in enumerate(examples):
|
| 219 |
+
instruction = ex[eval_func_param_names.index('instruction_nochat')]
|
| 220 |
+
iinput = ex[eval_func_param_names.index('iinput_nochat')]
|
| 221 |
+
context = ex[eval_func_param_names.index('context')]
|
| 222 |
clear_torch_cache()
|
| 223 |
print("")
|
| 224 |
print("START" + "=" * 100)
|
| 225 |
+
print("Question: %s %s" % (instruction, ('input=%s' % iinput if iinput else '')))
|
| 226 |
print("-" * 105)
|
| 227 |
# fun yields as generator, so have to iterate over it
|
| 228 |
# Also means likely do NOT want --stream_output=True, else would show all generations
|
|
|
|
| 231 |
if smodel:
|
| 232 |
score_with_prompt = False
|
| 233 |
if score_with_prompt:
|
| 234 |
+
data_point = dict(instruction=instruction, input=iinput, context=context)
|
| 235 |
prompter = Prompter(prompt_type, debug=debug, chat=chat, stream_output=stream_output)
|
| 236 |
prompt = prompter.generate_prompt(data_point)
|
| 237 |
else:
|
| 238 |
# just raw input and output
|
| 239 |
+
assert iinput in [None, ''] # should be no iinput
|
| 240 |
+
assert context in [None, ''] # should be no context
|
| 241 |
+
prompt = instruction
|
| 242 |
cutoff_len = 768 if is_low_mem else 2048
|
| 243 |
inputs = stokenizer(prompt, res,
|
| 244 |
return_tensors="pt",
|
|
|
|
| 266 |
print("SCORE %s: %s" % (exi, score), flush=True)
|
| 267 |
score_dump.append(ex + [prompt, res, score])
|
| 268 |
# dump every score in case of abort
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 269 |
df_scores = pd.DataFrame(score_dump,
|
| 270 |
+
columns=eval_func_param_names + eval_extra_columns)
|
| 271 |
+
df_scores.to_parquet(eval_filename, index=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 272 |
# plot histogram so far
|
| 273 |
plt.figure(figsize=(10, 10))
|
| 274 |
plt.hist(df_scores['score'], bins=20)
|
| 275 |
score_avg = np.mean(df_scores['score'])
|
| 276 |
score_median = np.median(df_scores['score'])
|
| 277 |
plt.title("Score avg: %s median: %s" % (score_avg, score_median))
|
| 278 |
+
plt.savefig(eval_filename.replace('.parquet', '.png'))
|
| 279 |
plt.close()
|
| 280 |
|
| 281 |
print("END" + "=" * 102)
|
|
|
|
| 284 |
print("Time taken so far: %.4f about %.4g per example" % (t2 - t0, (t2 - t0) / (1 + exi)))
|
| 285 |
t1 = time.time()
|
| 286 |
print("Total time taken: %.4f about %.4g per example" % (t1 - t0, (t1 - t0) / num_examples))
|
| 287 |
+
return eval_filename
|
| 288 |
+
|
| 289 |
if gradio:
|
| 290 |
go_gradio(**locals())
|
| 291 |
|
|
|
|
| 781 |
visible=not is_public and False)
|
| 782 |
do_sample = gr.Checkbox(label="Sample", info="Enable sampler, required for use of temperature, top_p, top_k",
|
| 783 |
value=kwargs['do_sample'])
|
| 784 |
+
temperature = gr.Slider(minimum=0.01, maximum=3,
|
| 785 |
value=kwargs['temperature'],
|
| 786 |
label="Temperature",
|
| 787 |
info="Lower is deterministic (but may lead to repeats), Higher more creative (but may lead to hallucinations)")
|
|
|
|
| 991 |
instruction_nochat_arg_id = eval_func_param_names.index('instruction_nochat')
|
| 992 |
question = args_list[instruction_nochat_arg_id]
|
| 993 |
|
| 994 |
+
if question is None:
|
| 995 |
+
return 'Response Score: Bad Question'
|
| 996 |
+
if answer is None:
|
| 997 |
+
return 'Response Score: Bad Answer'
|
| 998 |
+
|
| 999 |
question = question[-cutoff_len:]
|
| 1000 |
answer = answer[-cutoff_len:]
|
| 1001 |
|
|
|
|
| 1319 |
outputs=[model_state, model_used, lora_used, prompt_type])
|
| 1320 |
prompt_update_args = dict(fn=dropdown_prompt_type_list, inputs=prompt_type, outputs=prompt_type)
|
| 1321 |
chatbot_update_args = dict(fn=chatbot_list, inputs=[text_output, model_used], outputs=text_output)
|
| 1322 |
+
nochat_update_args = dict(fn=chatbot_list, inputs=[text_output, model_used], outputs=text_output_nochat)
|
| 1323 |
if not is_public:
|
| 1324 |
load_model_event = load_model_button.click(**load_model_args) \
|
| 1325 |
.then(**prompt_update_args) \
|
| 1326 |
.then(**chatbot_update_args) \
|
| 1327 |
+
.then(**nochat_update_args) \
|
| 1328 |
.then(clear_torch_cache)
|
| 1329 |
|
| 1330 |
load_model_args2 = dict(fn=load_model,
|
|
|
|
| 1749 |
if not prompt_type and model_lower in inv_prompt_type_to_model_lower:
|
| 1750 |
prompt_type = inv_prompt_type_to_model_lower[model_lower]
|
| 1751 |
|
| 1752 |
+
# examples at first don't include chat, instruction_nochat, iinput_nochat, added at end
|
| 1753 |
if show_examples is None:
|
| 1754 |
if chat:
|
| 1755 |
show_examples = False
|
|
|
|
| 1846 |
repetition_penalty = repetition_penalty or 1.07
|
| 1847 |
num_return_sequences = min(num_beams, num_return_sequences or 1)
|
| 1848 |
do_sample = False if do_sample is None else do_sample
|
| 1849 |
+
# doesn't include chat, instruction_nochat, iinput_nochat, added later
|
| 1850 |
params_list = ["", stream_output, prompt_type, temperature, top_p, top_k, num_beams, max_new_tokens, min_new_tokens,
|
| 1851 |
early_stopping, max_time, repetition_penalty, num_return_sequences, do_sample]
|
| 1852 |
|
|
|
|
| 1890 |
src_lang = "English"
|
| 1891 |
tgt_lang = "Russian"
|
| 1892 |
|
| 1893 |
+
# append chat plus empty instruction_nochat and iinput_nochat so each example can be indexed via eval_func_param_names
|
| 1894 |
+
for example in examples:
|
| 1895 |
+
example += [chat, '', '']
|
| 1896 |
+
# adjust examples if non-chat mode
|
| 1897 |
+
if not chat:
|
| 1898 |
example[eval_func_param_names.index('instruction_nochat')] = example[
|
| 1899 |
eval_func_param_names.index('instruction')]
|
| 1900 |
example[eval_func_param_names.index('instruction')] = ''
|