Spaces:

jijivski
/

FreshBench

Build error

App Files Files Community

jijivski committed on Mar 16, 2024

Commit

9882e38

1 Parent(s): e2bf898

hover and question _ppl

Browse files

Files changed (4) hide show

data/mata_df.csv +0 -0
data/model_release_time.csv +36 -0
gradio_samples/gradio_hover.py +93 -0
gradio_samples/web_ui.py +309 -0

data/mata_df.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

data/model_release_time.csv ADDED Viewed

	@@ -0,0 +1,36 @@

+Model,Release Date,model,MMLU,GSM8,Humanities,SocialSciences,STEM,Other,Longbench
+Baichuan2-13B-Base,2023-08-24,Baichuan2-13B-Base,58.1,52.7,51.54,66.2,47.89,65.24,62.55
+Baichuan2-13B-Chat,2023-06-24,Baichuan2-13B-Chat,52.1,55.0,50.71,65.19,47.13,65.01,
+Baichuan2-7B-Base,2023-08-24,Baichuan2-7B-Base,54.0,24.4,46.87,58.73,42.63,58.9,16.32
+Baichuan2-7B-Chat,2023-08-24,Baichuan2-7B-Chat,52.9,32.0,46.44,58.82,41.93,59.22,32.22
+Colossal-LLaMA-2-7b-base,2023-09-24,Colossal-LLaMA-2-7b-base,53.06,9.0,73.1,75.0,34.8,44.0,23.83
+HF_RWKV_v5-Eagle-7B,2023-11-15,HF_RWKV_v5-Eagle-7B,33.04,9.3,32.58,34.94,28.29,36.66,19.33
+Llama-2-13b-hf,2023-07-18,Llama-2-13b-hf,55.77,22.8,76.0,82.0,28.6,46.4,7.39
+Llama-2-7b-hf,2023-07-18,Llama-2-7b-hf,46.87,14.4,70.2,65.0,38.4,42.2,15.29
+Qwen-14B-Chat,2023-09-24,Qwen-14B-Chat,66.5,59.0,58.24,74.78,56.87,70.78,38.72
+Qwen-1_8B,2023-11-30,Qwen-1_8B,45.3,32.0,40.77,50.93,37.04,51.92,35.4
+Qwen-1_8B-Chat,2023-11-30,Qwen-1_8B-Chat,43.99,4.0,39.91,50.08,38.47,49.73,14.39
+Qwen-7B,2023-09-24,Qwen-7B,59.84,44.9,75.4,80.0,37.5,48.2,45.53
+Qwen-7B-Chat,2023-09-24,Qwen-7B-Chat,57.0,54.0,47.86,64.32,46.91,61.64,33.89
+Skywork-13B-base,2023-10-22,Skywork-13B-base,62.1,55.0,56.62,70.13,47.19,67.69,23.48
+TinyLlama-1.1B-Chat-v0.6,2023-11-24,TinyLlama-1.1B-Chat-v0.6,25.98,2.1,22.2,28.0,32.1,23.5,5.05
+Yi-6B,2024-01-17,Yi-6B,64.11,12.1,83.0,85.0,42.9,45.8,38.89
+Yi-6B-Chat,2024-01-17,Yi-6B-Chat,58.24,38.4,55.75,72.41,51.57,69.91,39.54
+baichuan-13b-chat,2023-06-24,baichuan-13b-chat,52.1,0.1,43.95,56.48,38.19,56.2,9.29
+baichuan-7b-chat,2023-09-24,baichuan-7b-chat,42.8,9.1,40.83,46.96,35.17,47.28,23.3
+chatglm3-6b,2023-10-24,chatglm3-6b,61.4,72.0,46.12,59.12,42.82,56.49,
+falcon-rw-1b,2023-04-24,falcon-rw-1b,25.28,0.5,29.2,28.0,29.5,28.9,5.31
+interlm-20b,2023-09-18,interlm-20b,61.85,23.0,,,,,
+internlm-chat-7b,2023-06-06,internlm-chat-7b,50.8,34.0,46.08,58.56,40.28,56.68,9.73
+llama2-7b-chat-hf,2023-07-18,llama2-7b-chat-hf,48.32,45.5,72.5,72.0,30.4,43.4,19.58
+llama_hf_7b,2023-02-18,llama_hf_7b,46.87,10.0,31.99,31.75,28.35,36.63,4.7
+mistral-7b-v0.1,2023-09-18,mistral-7b-v0.1,64.16,37.8,83.0,86.0,48.2,55.4,32.2
+opt-13b,2022-05-11,opt-13b,24.9,1.7,31.0,27.0,36.6,25.9,
+opt-2.7b,2022-05-11,opt-2.7b,25.43,0.2,26.67,24.63,26.86,24.01,66.67
+phi-1_5,2023-08-18,phi-1_5,43.89,12.4,46.8,64.0,38.4,41.0,8.07
+phi-2,2023-12-24,phi-2,58.11,54.8,69.0,77.0,49.1,47.6,5.49
+pythia-12b,2023-02-24,pythia-12b,26.76,1.7,30.4,29.0,33.0,31.9,
+vicuna-7b-v1.5,2023-07-24,vicuna-7b-v1.5,50.82,8.1,71.3,76.0,44.6,42.8,20.39
+xverse-13b,2023-08-06,xverse-13b,55.1,18.0,,,,,
+zephyr-7b-beta,2023-10-26,zephyr-7b-beta,60.7,11.3,80.7,78.0,34.8,51.2,35.74
+zhongjing-base,2023-09-24,zhongjing-base,48.23,26.0,75.4,69.0,39.3,45.2,3.36

gradio_samples/gradio_hover.py ADDED Viewed

	@@ -0,0 +1,93 @@

+# import gradio as gr
+# def generate_hoverable_html(text):
+#     # 分割文本为单词
+#     words = text.split()
+#     # 为每个单词创建一个带有悬停信息的 HTML span 元素
+#     html_words = [
+#         f'<span class="hoverable-word" data-info="Information about {word}">{word}</span>'
+#         for word in words
+#     ]
+#     # 将单词合并回字符串
+#     hoverable_html = ' '.join(html_words)
+#     # 添加 CSS 和 JavaScript
+#     custom_html = f"""
+#     <style>
+#         .hoverable-word {{
+#             color: blue;
+#             cursor: pointer;
+#         }}
+#         .hoverable-word:hover::after {{
+#             content: attr(data-info);
+#             color: white;
+#             background-color: black;
+#             padding: 4px;
+#             margin-left: 8px;
+#             position: absolute;
+#         }}
+#     </style>
+#     <div>{hoverable_html}</div>
+#     """
+#     return custom_html
+# # 创建 Gradio 界面
+# with gr.Blocks() as demo:
+#     with gr.Row():
+#         text_input = gr.Textbox(label="Input Text", placeholder="Type here...")
+#     output_html = gr.HTML()
+#     # 连接输入、处理函数和输出
+#     text_input.change(generate_hoverable_html, text_input, output_html)
+# demo.launch()
+import gradio as gr
+def generate_hoverable_html(text):
+    # 分割文本为单词
+    words = text.split()
+    prob_dic={'a':{'b':0.1,'c':0.2},'b':{'a':0.1,'c':0.2}}
+    # 为每个单词创建一个带有悬停信息的 HTML span 元素
+    html_words = [
+        f'<span class="hoverable-word" data-info="{prob_dic[word]}">{word}</span>'
+        for word in words
+    ]
+    # 将单词合并回字符串
+    hoverable_html = ' '.join(html_words)
+    # 添加 CSS 和 JavaScript
+    custom_html = f"""
+    <style>
+        .hoverable-word {{
+            color: blue;
+            cursor: pointer;
+        }}
+        .hoverable-word:hover::after {{
+            content: attr(data-info);
+            color: white;
+            background-color: black;
+            padding: 4px;
+            margin-left: 8px;
+            position: absolute;
+        }}
+    </style>
+    <div>{hoverable_html}</div>
+    """
+    return custom_html
+# 创建 Gradio 界面
+with gr.Blocks() as demo:
+    with gr.Row():
+        text_input = gr.Textbox(label="Input Text", placeholder="Type here...")
+    output_html = gr.HTML()
+    # 连接输入、处理函数和输出
+    text_input.change(generate_hoverable_html, text_input, output_html)
+demo.launch(debug=True)

gradio_samples/web_ui.py ADDED Viewed

	@@ -0,0 +1,309 @@

+import time
+import gradio
+import numpy as np
+import torch
+from transformers import LogitsProcessor
+from modules import html_generator, shared
+params = {
+    'active': True,
+    'color_by_perplexity': False,
+    'color_by_probability': False,
+    'ppl_scale': 15.0,  # No slider for this right now, because I don't think it really needs to be changed. Very large perplexity scores don't show up often.
+    'probability_dropdown': False,
+    'verbose': False  # For debugging mostly
+}
+class PerplexityLogits(LogitsProcessor):
+    def __init__(self, verbose=False):
+        self.generated_token_ids = []
+        self.selected_probs = []
+        self.top_token_ids_list = []
+        self.top_probs_list = []
+        self.perplexities_list = []
+        self.last_probs = None
+        self.verbose = verbose
+    def __call__(self, input_ids, scores):
+        # t0 = time.time()
+        probs = torch.softmax(scores, dim=-1, dtype=torch.float)
+        log_probs = torch.nan_to_num(torch.log(probs))  # Note: This is to convert log(0) nan to 0, but probs*log_probs makes this 0 not affect the perplexity.
+        entropy = -torch.sum(probs * log_probs)
+        entropy = entropy.cpu().numpy()
+        perplexity = round(float(np.exp(entropy)), 4)
+        self.perplexities_list.append(perplexity)
+        last_token_id = int(input_ids[0][-1].cpu().numpy().item())
+        # Store the generated tokens (not sure why this isn't accessible in the output endpoint!)
+        self.generated_token_ids.append(last_token_id)
+        # Get last probability, and add to the list if it wasn't there
+        if len(self.selected_probs) > 0:
+            # Is the selected token in the top tokens?
+            if self.verbose:
+                print('Probs: Token after', shared.tokenizer.decode(last_token_id))
+                print('Probs:', [shared.tokenizer.decode(token_id) for token_id in self.top_token_ids_list[-1][0]])
+                print('Probs:', [round(float(prob), 4) for prob in self.top_probs_list[-1][0]])
+            if last_token_id in self.top_token_ids_list[-1][0]:
+                idx = self.top_token_ids_list[-1][0].index(last_token_id)
+                self.selected_probs.append(self.top_probs_list[-1][0][idx])
+            else:
+                self.top_token_ids_list[-1][0].append(last_token_id)
+                last_prob = round(float(self.last_probs[last_token_id]), 4)
+                self.top_probs_list[-1][0].append(last_prob)
+                self.selected_probs.append(last_prob)
+        else:
+            self.selected_probs.append(1.0)  # Placeholder for the last token of the prompt
+        if self.verbose:
+            pplbar = "-"
+            if not np.isnan(perplexity):
+                pplbar = "*" * round(perplexity)
+            print(f"PPL: Token after {shared.tokenizer.decode(last_token_id)}\t{perplexity:.2f}\t{pplbar}")
+        # Get top 5 probabilities
+        top_tokens_and_probs = torch.topk(probs, 5)
+        top_probs = top_tokens_and_probs.values.cpu().numpy().astype(float).tolist()
+        top_token_ids = top_tokens_and_probs.indices.cpu().numpy().astype(int).tolist()
+        self.top_token_ids_list.append(top_token_ids)
+        self.top_probs_list.append(top_probs)
+        probs = probs.cpu().numpy().flatten()
+        self.last_probs = probs  # Need to keep this as a reference for top probs
+        # t1 = time.time()
+        # print(f"PPL Processor: {(t1-t0):.3f} s")
+        # About 1 ms, though occasionally up to around 100 ms, not sure why...
+        # Doesn't actually modify the logits!
+        return scores
+# Stores the perplexity and top probabilities
+ppl_logits_processor = None
+def logits_processor_modifier(logits_processor_list, input_ids):
+    global ppl_logits_processor
+    if params['active']:
+        ppl_logits_processor = PerplexityLogits(verbose=params['verbose'])
+        logits_processor_list.append(ppl_logits_processor)
+def output_modifier(text):
+    global ppl_logits_processor
+    # t0 = time.time()
+    if not params['active']:
+        return text
+    # TODO: It's probably more efficient to do this above rather than modifying all these lists
+    # Remove last element of perplexities_list, top_token_ids_list, top_tokens_list, top_probs_list since everything is off by one because this extension runs before generation
+    perplexities = ppl_logits_processor.perplexities_list[:-1]
+    top_token_ids_list = ppl_logits_processor.top_token_ids_list[:-1]
+    top_tokens_list = [[shared.tokenizer.decode(token_id) for token_id in top_token_ids[0]] for top_token_ids in top_token_ids_list]
+    top_probs_list = ppl_logits_processor.top_probs_list[:-1]
+    # Remove first element of generated_token_ids, generated_tokens, selected_probs because they are for the last token of the prompt
+    gen_token_ids = ppl_logits_processor.generated_token_ids[1:]
+    gen_tokens = [shared.tokenizer.decode(token_id) for token_id in gen_token_ids]
+    sel_probs = ppl_logits_processor.selected_probs[1:]
+    end_part = '</div></div>' if params['probability_dropdown'] else '</span>'  # Helps with finding the index after replacing part of the text.
+    i = 0
+    for token, prob, ppl, top_tokens, top_probs in zip(gen_tokens, sel_probs, perplexities, top_tokens_list, top_probs_list):
+        color = 'ffffff'
+        if params['color_by_probability'] and params['color_by_perplexity']:
+            color = probability_perplexity_color_scale(prob, ppl)
+        elif params['color_by_perplexity']:
+            color = perplexity_color_scale(ppl)
+        elif params['color_by_probability']:
+            color = probability_color_scale(prob)
+        if token in text[i:]:
+            if params['probability_dropdown']:
+                text = text[:i] + text[i:].replace(token, add_dropdown_html(token, color, top_tokens, top_probs[0], ppl), 1)
+            else:
+                text = text[:i] + text[i:].replace(token, add_color_html(token, color), 1)
+            i += text[i:].find(end_part) + len(end_part)
+    # Use full perplexity list for calculating the average here.
+    print('Average perplexity:', round(np.mean(ppl_logits_processor.perplexities_list[:-1]), 4))
+    # t1 = time.time()
+    # print(f"Modifier: {(t1-t0):.3f} s")
+    # About 50 ms
+    return text
+def probability_color_scale(prob):
+    '''
+    Green-yellow-red color scale
+    '''
+    rv = 0
+    gv = 0
+    if prob <= 0.5:
+        rv = 'ff'
+        gv = hex(int(255 * prob * 2))[2:]
+        if len(gv) < 2:
+            gv = '0' * (2 - len(gv)) + gv
+    else:
+        rv = hex(int(255 - 255 * (prob - 0.5) * 2))[2:]
+        gv = 'ff'
+        if len(rv) < 2:
+            rv = '0' * (2 - len(rv)) + rv
+    return rv + gv + '00'
+def perplexity_color_scale(ppl):
+    '''
+    Red component only, white for 0 perplexity (sorry if you're not in dark mode)
+    '''
+    value = hex(max(int(255.0 - params['ppl_scale'] * (float(ppl) - 1.0)), 0))[2:]
+    if len(value) < 2:
+        value = '0' * (2 - len(value)) + value
+    return 'ff' + value + value
+def probability_perplexity_color_scale(prob, ppl):
+    '''
+    Green-yellow-red for probability and blue component for perplexity
+    '''
+    rv = 0
+    gv = 0
+    bv = hex(min(max(int(params['ppl_scale'] * (float(ppl) - 1.0)), 0), 255))[2:]
+    if len(bv) < 2:
+        bv = '0' * (2 - len(bv)) + bv
+    if prob <= 0.5:
+        rv = 'ff'
+        gv = hex(int(255 * prob * 2))[2:]
+        if len(gv) < 2:
+            gv = '0' * (2 - len(gv)) + gv
+    else:
+        rv = hex(int(255 - 255 * (prob - 0.5) * 2))[2:]
+        gv = 'ff'
+        if len(rv) < 2:
+            rv = '0' * (2 - len(rv)) + rv
+    return rv + gv + bv
+def add_color_html(token, color):
+    return f'<span style="color: #{color}">{token}</span>'
+# TODO: Major issue: Applying this to too many tokens will cause a permanent slowdown in generation speed until the messages are removed from the history.
+# I think the issue is from HTML elements taking up space in the visible history, and things like history deepcopy add latency proportional to the size of the history.
+# Potential solution is maybe to modify the main generation code to send just the internal text and not the visible history, to avoid moving too much around.
+# I wonder if we can also avoid using deepcopy here.
+def add_dropdown_html(token, color, top_tokens, top_probs, perplexity=0):
+    html = f'<div class="hoverable"><span style="color: #{color}">{token}</span><div class="dropdown"><table class="dropdown-content"><tbody>'
+    for token_option, prob in zip(top_tokens, top_probs):
+        # TODO: Bold for selected token?
+        # Using divs prevented the problem of divs inside spans causing issues.
+        # Now the problem is that divs show the same whitespace of one space between every token.
+        # There is probably some way to fix this in CSS that I don't know about.
+        row_color = probability_color_scale(prob)
+        row_class = ' class="selected"' if token_option == token else ''
+        html += f'<tr{row_class}><td style="color: #{row_color}">{token_option}</td><td style="color: #{row_color}">{prob:.4f}</td></tr>'
+    if perplexity != 0:
+        ppl_color = perplexity_color_scale(perplexity)
+        html += f'<tr><td>Perplexity:</td><td style="color: #{ppl_color}">{perplexity:.4f}</td></tr>'
+    html += '</tbody></table></div></div>'
+    return html  # About 750 characters per token...
+def custom_css():
+    return """
+        .dropdown {
+            display: none;
+            position: absolute;
+            z-index: 50;
+            background-color: var(--block-background-fill);
+            box-shadow: 0px 8px 16px 0px rgba(0,0,0,0.2);
+            width: max-content;
+            overflow: visible;
+            padding: 5px;
+            border-radius: 10px;
+            border: 1px solid var(--border-color-primary);
+        }
+        .dropdown-content {
+            border: none;
+            z-index: 50;
+        }
+        .dropdown-content tr.selected {
+            background-color: var(--block-label-background-fill);
+        }
+        .dropdown-content td {
+            color: var(--body-text-color);
+        }
+        .hoverable {
+            color: var(--body-text-color);
+            position: relative;
+            display: inline-block;
+            overflow: visible;
+            font-size: 15px;
+            line-height: 1.75;
+            margin: 0;
+            padding: 0;
+        }
+        .hoverable:hover .dropdown {
+            display: block;
+        }
+        pre {
+            white-space: pre-wrap;
+        }
+        # TODO: This makes the hover menus extend outside the bounds of the chat area, which is good.
+        # However, it also makes the scrollbar disappear, which is bad.
+        # The scroll bar needs to still be present. So for now, we can't see dropdowns that extend past the edge of the chat area.
+        #.chat {
+        #    overflow-y: auto;
+        #}
+    """
+# Monkeypatch applied to html_generator.py
+# We simply don't render markdown into HTML. We wrap everything in <pre> tags to preserve whitespace
+# formatting. If you're coloring tokens by perplexity or probability, or especially if you're using
+# the probability dropdown, you probably care more about seeing the tokens the model actually outputted
+# rather than rendering ```code blocks``` or *italics*.
+def convert_to_markdown(string):
+    return '<pre>' + string + '</pre>'
+html_generator.convert_to_markdown = convert_to_markdown
+def ui():
+    def update_active_check(x):
+        params.update({'active': x})
+    def update_color_by_ppl_check(x):
+        params.update({'color_by_perplexity': x})
+    def update_color_by_prob_check(x):
+        params.update({'color_by_probability': x})
+    def update_prob_dropdown_check(x):
+        params.update({'probability_dropdown': x})
+    active_check = gradio.Checkbox(value=True, label="Compute probabilities and perplexity scores", info="Activate this extension. Note that this extension currently does not work with exllama or llama.cpp.")
+    color_by_ppl_check = gradio.Checkbox(value=False, label="Color by perplexity", info="Higher perplexity is more red. If also showing probability, higher perplexity has more blue component.")
+    color_by_prob_check = gradio.Checkbox(value=False, label="Color by probability", info="Green-yellow-red linear scale, with 100% green, 50% yellow, 0% red.")
+    prob_dropdown_check = gradio.Checkbox(value=False, label="Probability dropdown", info="Hover over a token to show a dropdown of top token probabilities. Currently slightly buggy with whitespace between tokens.")
+    active_check.change(update_active_check, active_check, None)
+    color_by_ppl_check.change(update_color_by_ppl_check, color_by_ppl_check, None)
+    color_by_prob_check.change(update_color_by_prob_check, color_by_prob_check, None)
+    prob_dropdown_check.change(update_prob_dropdown_check, prob_dropdown_check, None)