import os
from io import StringIO

import gradio as gr
import pandas as pd
|
|
|
|
| |
# Leaderboard data: one row per detector, one F1 column per generator.
# Each "Detector" cell is three newline-separated parts: detector name,
# citation, and an HTML link.
# NOTE(review): every link points at the same URL -- presumably placeholders.
_DETECTOR_LABELS = [
    """ChatGPT_QA\n(Antoun et al., 2023)\n<a href='https://www.google.de' style="color:blue">Paper</a>""",
    """XMLMR_ChatGPT\n(Guo et al., 2023)\n<a href='https://www.google.de' style="color:blue">Paper</a>""",
    """LLMDET\n(Liu et al., 2019)\n<a href='https://www.google.de' style="color:blue">Paper</a>""",
    """Radar_Vicuna7B\n(Hu et al, 2023)\n<a href='https://www.google.de' style="color:blue">Paper</a>""",
    """GPTZero\n(Numo et al.)\n<a href='https://www.google.de' style="color:blue">WebApp</a>""",
]

# F1 score columns, in display order; each list is aligned with
# _DETECTOR_LABELS row by row.
_F1_SCORES = {
    "All Gen. [F1]": [0.723, 0.563, 0.736, 0.635, 0.464],
    "Llama-2 7B Gen. [F1]": [0.719, 0.636, 0.622, 0.722, 0.777],
    "GPT-4 Gen. [F1]": [0.673, 0.435, 0.362, 0.232, 0.562],
    "GPT-3 Gen. [F1]": [0.374, 0.335, 0.232, 0.632, 0.533],
    "Falcon 7B Gen. [F1]": [0.445, 0.454, 0.646, 0.665, 0.464],
}

df = pd.DataFrame(
    {
        # Ranks are simply 1..N in row order.
        "Rank": list(range(1, len(_DETECTOR_LABELS) + 1)),
        "Detector": _DETECTOR_LABELS,
        **_F1_SCORES,
    }
)
|
|
|
|
def _csv_source(temp_file):
    """Return something ``pd.read_csv`` can read for the given upload value."""
    if isinstance(temp_file, str):
        # A string is either a path to the uploaded file or raw CSV text.
        # NOTE(review): which one Gradio passes presumably depends on the
        # installed version -- confirm against the UploadButton docs.
        if os.path.isfile(temp_file):
            return temp_file
        return StringIO(temp_file)
    # Otherwise expect a file-like wrapper exposing the on-disk path as .name.
    return temp_file.name


def process_csv_text(temp_file):
    """Load an uploaded CSV into a DataFrame.

    Args:
        temp_file: A path to a CSV file, a string holding CSV text, or an
            object whose ``name`` attribute is the path of the uploaded file.

    Returns:
        The parsed ``pandas.DataFrame``. The "Start date" and "End date"
        columns are parsed as dates when the file contains them.
    """
    date_columns = ("Start date", "End date")

    # Peek at the header first: pandas raises ValueError when parse_dates
    # names a column the file does not contain, which would reject uploads
    # that only carry the "id" and "prediction" columns.
    header = pd.read_csv(_csv_source(temp_file), nrows=0)
    present = [col for col in date_columns if col in header.columns]

    frame = pd.read_csv(_csv_source(temp_file), parse_dates=present or False)
    print(frame)
    return frame
| |
# Names of the numeric leaderboard columns (not referenced again in the
# visible part of this file).
numeric_cols = df.select_dtypes(include="number").columns

# Columns in which the best (maximum) score gets highlighted.
traget_cols = ["All Gen. [F1]", "Llama-2 7B Gen. [F1]", "GPT-4 Gen. [F1]"]

# Styled view of the leaderboard: centre the detector labels and mark the
# column-wise maximum of each target column in light green.
styler = (
    df.style
    .set_properties(subset=["Detector"], **{"text-align": "center"})
    .highlight_max(subset=traget_cols, color="lightgreen", axis=0)
)

title = "Test"
desc = "Test gradio"
| |
# Page layout: intro text, styled leaderboard, download section, and an
# upload section that shows the uploaded predictions in a table.
with gr.Blocks() as demo:

    # Header and introductory description (HTML embedded in Markdown).
    gr.Markdown(
        """
        <div align="center">
        <center><h1>BUST: Benchmark for the evaluation of system detectors of LLM-Generated Text</h1></center>
        <center> Welcome to BUST, a comprehensive benchmark for evaluating synthetic text detectors, focusing on their \
        effectiveness against outputs from various Large Language Models (LLMs). BUST evaluates detectors using a wide \
        range of metrics including linguistic features, readability, and writer attitudes, aiming to identify spurious \
        signals that may influence detection. The benchmark not only ranks detectors but also analyzes their performance \
        correlations with specific metrics and provides insights into biases and robustness against different LLM outputs. \
        BUST is designed as a dynamic resource, continuously updated to stay relevant in the rapidly evolving field of \
        LLM-generated content detection.</center>
        <p> </p>
        <p> </p>
        </div>
        """
    )

    # Leaderboard; cells are rendered as markdown with line breaks enabled so
    # the multi-line detector labels and their links display properly.
    gr.DataFrame(styler, line_breaks=True, datatype="markdown")

    # Dataset download section (content is still a placeholder).
    gr.Markdown(
        """
        <center><h1>Download test dataset</h1></center>
        <center>...</center>
        """
    )

    # Prediction upload section.
    gr.Markdown(
        """
        <center><h1>Upload predictions</h1></center>
        <center>Upload your predictions. Please make sure that the file is in the right format (.csv) and contains the "id" and "prediction" columns.</center>
        """
    )

    upload_button = gr.UploadButton(
        label="Upload Predictions",
        file_types=[".csv"],
        file_count="single",
    )
    table = gr.Dataframe(
        headers=[
            "Detector",
            "All Gen. [F1]",
            "Llama-2 7B Gen. [F1]",
            "GPT-4 Gen. [F1]",
            "GPT-3 Gen. [F1]",
            "Falcon 7B Gen. [F1]",
        ],
        type="pandas",
        col_count=6,
    )

    # When a file is uploaded, parse it and show the result in the table.
    upload_button.upload(
        fn=process_csv_text,
        inputs=upload_button,
        outputs=table,
        api_name="upload_csv",
    )

demo.launch()
|
|