Spaces:

MaroueneA
/

llm-evaluation

Build error

File size: 7,342 Bytes

90ad0a1
 
 
 
 
a982ad4
 
90ad0a1
 
 
 
 
 
 
 
 
 
a982ad4
 
90ad0a1
 
 
 
 
 
a982ad4
90ad0a1
 
 
a982ad4
90ad0a1
 
a982ad4
90ad0a1
a982ad4
90ad0a1
 
 
 
 
 
a982ad4
 
 
 
 
90ad0a1
 
a982ad4
 
90ad0a1
a982ad4
90ad0a1
a982ad4
90ad0a1
 
 
 
 
 
 
 
a982ad4
 
 
 
 
90ad0a1
 
a982ad4
90ad0a1
a982ad4
 
 
 
 
 
 
 
90ad0a1
 
 
a982ad4
 
90ad0a1
 
a982ad4
 
 
90ad0a1
a982ad4
90ad0a1
a982ad4
90ad0a1
a982ad4
90ad0a1
 
 
a982ad4
 
90ad0a1
 
a982ad4
 
90ad0a1
 
 
a982ad4
 
 
 
 
 
90ad0a1
 
a982ad4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90ad0a1
a982ad4
90ad0a1
a982ad4
90ad0a1
 
a982ad4
90ad0a1
a982ad4
90ad0a1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a982ad4
90ad0a1
a982ad4
90ad0a1
 
 
 
 
 
 
 
 
 
 
 
a982ad4
90ad0a1
a982ad4
90ad0a1

import gradio as gr
import pandas as pd
import gspread
from oauth2client.service_account import ServiceAccountCredentials

# Title of the Google Sheet that init_gsheets() opens.
SPREADSHEET_NAME = "dataset"    # The name of your Google Sheet
# Tab inside that spreadsheet that holds the prompts and the feedback columns.
WORKSHEET_NAME = "sheet1"       # The tab/worksheet name
# Module-level state shared by the functions in this file; all three are
# assigned by init_gsheets().
# df: pandas DataFrame built from the worksheet (first sheet row = headers).
df = None
# current_index: DataFrame index label of the row currently being reviewed,
# or None when no unreviewed row is left.
current_index = None
# ws: gspread worksheet handle used to write feedback back to the sheet.
ws = None

def init_gsheets():
    """
    Authenticate with Google Sheets and load the worksheet into a DataFrame.

    Reads service-account credentials from ``creds.json``, opens worksheet
    ``WORKSHEET_NAME`` of spreadsheet ``SPREADSHEET_NAME`` and fills the
    module-level globals:

    * ``ws`` - the gspread worksheet handle,
    * ``df`` - every row below the header row, with the header row as columns,
    * ``current_index`` - index of the first row whose
      "Human judges quality" cell is empty, or ``None`` if there is none.

    Raises:
        ValueError: if the worksheet has no header row, or if it has no
            "Human judges quality" column (the column used to tell reviewed
            rows from unreviewed ones).
    """
    global df, current_index, ws

    # Scopes for Google Sheets
    scope = [
        "https://spreadsheets.google.com/feeds",
        "https://www.googleapis.com/auth/spreadsheets",
        "https://www.googleapis.com/auth/drive"
    ]

    creds = ServiceAccountCredentials.from_json_keyfile_name("creds.json", scope)
    gc = gspread.authorize(creds)
    sh = gc.open(SPREADSHEET_NAME)
    ws = sh.worksheet(WORKSHEET_NAME)

    # Read all values from the sheet
    data = ws.get_all_values()
    # Fail with a clear message instead of an IndexError on data[0] when the
    # sheet is completely empty.
    if not data or not data[0]:
        raise ValueError(
            f"Worksheet '{WORKSHEET_NAME}' of spreadsheet '{SPREADSHEET_NAME}' "
            "is empty: a header row is required."
        )

    df = pd.DataFrame(data[1:], columns=data[0])  # row 1 = headers

    if "Human judges quality" not in df.columns:
        raise ValueError(
            f"Worksheet '{WORKSHEET_NAME}' has no 'Human judges quality' column."
        )

    # A row counts as unreviewed while its "Human judges quality" cell is empty.
    unreviewed = df.index[df["Human judges quality"] == ""]
    # Store a plain int (not a numpy integer) so later row arithmetic and
    # anything that serialises the value deal with a built-in type.
    current_index = int(unreviewed[0]) if len(unreviewed) > 0 else None

def get_prompt_data():
    """
    Fetch what should be displayed for the row currently under review.

    Returns a 4-tuple ``(prompt, llm1_response, llm2_response, all_done)``.
    While a row is pending, the first three items come from the "Prompt",
    "LLM1 response" and "LLM2 response" columns of that row and ``all_done``
    is False. When no row is pending (``current_index`` is None) the prompt
    slot carries a "done" message, both responses are empty strings and
    ``all_done`` is True.
    """
    if current_index is not None:
        record = df.loc[current_index]
        return (
            record["Prompt"],
            record["LLM1 response"],
            record["LLM2 response"],
            False,
        )

    # Nothing left to review.
    return "All rows have been reviewed.", "", "", True

def save_and_load_next(
    preference,
    factual_accuracy,
    relevance,
    llm1_issues,
    llm2_issues,
    llm1_tunisian_score,
    llm2_tunisian_score
):
    """
    Save the feedback for the current row, then load the next unreviewed row.

    Steps:
    1) Reject the submission if no preference was chosen (nothing is saved).
    2) Write all feedback cells of the current row to Google Sheets in one
       batched request.
    3) Mirror the feedback into the in-memory DataFrame.
    4) Move to the next row whose "Human judges quality" cell is empty.

    Args:
        preference: value for "Human judges quality"; required, because this
            column is what marks a row as reviewed.
        factual_accuracy: value for "Human judges correctness".
        relevance: value for "Human judges relevance".
        llm1_issues: list of issue labels for LLM1 (joined with ", ").
        llm2_issues: list of issue labels for LLM2 (joined with ", ").
        llm1_tunisian_score: value for "Human LLM1 Tunisian usage score".
        llm2_tunisian_score: value for "Human LLM2 Tunisian usage score".

    Returns:
        A 4-tuple ``(prompt, llm1_response, llm2_response, status_message)``
        describing what the UI should display next.
    """
    global df, current_index, ws

    def texts_at(index):
        # Prompt and both responses of one DataFrame row.
        row = df.loc[index]
        return row["Prompt"], row["LLM1 response"], row["LLM2 response"]

    # If we're out of rows, just return "all done"
    if current_index is None:
        return (
            "All rows have been reviewed.",  # prompt
            "",                              # LLM1 resp
            "",                              # LLM2 resp
            "No more rows to review!"        # status message
        )

    # Without a preference the review-marker column would receive None, which
    # no longer equals "" and would silently drop the row from the unreviewed
    # queue with no judgement recorded. Keep the reviewer on the same prompt.
    if not preference:
        return (
            *texts_at(current_index),
            "Please select which response you prefer before submitting."
        )

    feedback = {
        "Human judges quality": preference,
        "Human judges correctness": factual_accuracy,
        "Human judges relevance": relevance,
        "Human LLM1 flagged issues": ", ".join(llm1_issues) if llm1_issues else "",
        "Human LLM2 flagged issues": ", ".join(llm2_issues) if llm2_issues else "",
        "Human LLM1 Tunisian usage score": llm1_tunisian_score,
        "Human LLM2 Tunisian usage score": llm2_tunisian_score,
    }

    # Column positions follow the DataFrame's column order. A feedback column
    # missing from the sheet is placed after the existing columns, which is
    # where the DataFrame puts it when the value is assigned below.
    headers = list(df.columns)
    for column_name in feedback:
        if column_name not in headers:
            headers.append(column_name)

    sheet_row = int(current_index) + 2  # row 1 is headers, sheet rows are 1-based

    # One batched request instead of one request per cell: either the whole
    # row of feedback is sent or the call raises. USER_ENTERED is the input
    # mode gspread's update_cell uses (per its documentation).
    cell_updates = [
        {
            "range": gspread.utils.rowcol_to_a1(
                sheet_row, headers.index(column_name) + 1  # 1-based column
            ),
            "values": [[value]],
        }
        for column_name, value in feedback.items()
    ]
    ws.batch_update(cell_updates, value_input_option="USER_ENTERED")

    # Only mark the row as reviewed in memory once the sheet write succeeded,
    # so a failed write leaves the row pending and the reviewer can retry.
    for column_name, value in feedback.items():
        df.at[current_index, column_name] = value

    # Move to the next unreviewed row
    pending = df.index[df["Human judges quality"] == ""]
    if len(pending) == 0:
        current_index = None
        return (
            "All rows have been reviewed.",  # prompt
            "",                              # LLM1 resp
            "",                              # LLM2 resp
            "All rows have been reviewed. Thank you!"  # status message
        )

    current_index = int(pending[0])
    return (
        *texts_at(current_index),
        "Feedback saved! Moving to the next prompt..."
    )

def on_load():
    """
    Produce the initial values for the interface when it loads.

    Returns ``(prompt, llm1_response, llm2_response, status_message)``. The
    first three come straight from get_prompt_data(); the status message is
    empty while a row is pending and a "done" notice otherwise.
    """
    prompt, llm1resp, llm2resp, all_done = get_prompt_data()
    status = "No next prompt. All done." if all_done else ""
    return prompt, llm1resp, llm2resp, status

# ---- Load the worksheet once, before the interface is built
init_gsheets()

# ---- Build Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# LLM Responses Evaluation (Google Sheets)")

    # Read-only display of the row under review, plus a status line.
    prompt_text = gr.Textbox(label="Prompt", interactive=False)
    llm1_text = gr.Textbox(label="LLM1 Response", interactive=False)
    llm2_text = gr.Textbox(label="LLM2 Response", interactive=False)
    status_msg = gr.Markdown()

    # The three head-to-head questions share one set of answers.
    comparison_choices = ["LLM1", "LLM2", "Tie", "Both are bad"]
    preference = gr.Radio(
        list(comparison_choices),
        label="Which response do you prefer?"
    )
    factual_accuracy = gr.Radio(
        list(comparison_choices),
        label="Which response is more factually accurate?"
    )
    relevance = gr.Radio(
        list(comparison_choices),
        label="Which response better addresses the prompt?"
    )

    # Both responses are checked against the same list of issues.
    issue_choices = [
        "Hate Speech",
        "Not Arabic",
        "Inappropriate Content",
        "Sexual Content",
        "Untruthful Info",
        "Violent Content",
        "Personal Information"
    ]
    llm1_issues = gr.CheckboxGroup(
        list(issue_choices),
        label="Does Response 1 contain any issues?"
    )
    llm2_issues = gr.CheckboxGroup(
        list(issue_choices),
        label="Does Response 2 contain any issues?"
    )

    # Tunisian Arabic usage is rated per response on the same 0-2 scale,
    # preselected to 0.
    score_choices = [0, 1, 2]
    llm1_tunisian_score = gr.Radio(
        list(score_choices),
        label="Rate LLM1's use of Tunisian Arabic",
        value=0
    )
    llm2_tunisian_score = gr.Radio(
        list(score_choices),
        label="Rate LLM2's use of Tunisian Arabic",
        value=0
    )

    submit_btn = gr.Button("Submit Feedback")

    # Order matches the positional parameters of save_and_load_next.
    feedback_inputs = [
        preference,
        factual_accuracy,
        relevance,
        llm1_issues,
        llm2_issues,
        llm1_tunisian_score,
        llm2_tunisian_score
    ]
    # Both callbacks return (prompt, LLM1 response, LLM2 response, status).
    display_outputs = [prompt_text, llm1_text, llm2_text, status_msg]

    # Single callback: save feedback and immediately load next prompt
    submit_btn.click(
        fn=save_and_load_next,
        inputs=feedback_inputs,
        outputs=display_outputs
    )

    # On initial load: display the first unreviewed prompt
    demo.load(
        fn=on_load,
        inputs=[],
        outputs=display_outputs
    )

demo.launch()