Spaces:

MaroueneA
/

llm-evaluation

Build error

App Files Files Community

MaroueneA committed on Feb 1, 2025

Commit

bdddd07

verified ·

1 Parent(s): 8d44e85

Upload 3 files

Browse files

Files changed (4) hide show

.gitattributes +1 -0
app.py +237 -0
dataset.xlsx +3 -0
requirements.txt +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+dataset.xlsx filter=lfs diff=lfs merge=lfs -text

app.py ADDED Viewed

	@@ -0,0 +1,237 @@

+import gradio as gr
+import pandas as pd
+import os
+# Global variables
+# Path of the Excel workbook: load_dataset() reads it and save_feedback() overwrites it.
+DATASET_PATH = "dataset.xlsx"
+# DataFrame holding the workbook contents; stays None until load_dataset() has run.
+df = None
+# Index label of the row currently under review; None before loading and once no unreviewed row remains.
+current_index = None
+def load_dataset():
+    """Load the Excel dataset into a global pandas DataFrame."""
+    global df, current_index
+    if not os.path.exists(DATASET_PATH):
+        raise FileNotFoundError(f"Excel file not found at {DATASET_PATH}")
+    df = pd.read_excel(DATASET_PATH)
+    # Identify if there's any row that is "unreviewed".
+    # We'll consider a row unreviewed if 'Human judges quality' is NaN or empty.
+    # Adjust the column or logic as needed for your real use case.
+    unreviewed_rows = df[df['Human judges quality'].isna()]
+    if len(unreviewed_rows) == 0:
+        current_index = None  # Means no rows left to review
+    else:
+        # Pick the first unreviewed row
+        current_index = unreviewed_rows.index[0]
+def get_next_prompt():
+    """
+    Fetch the next unreviewed row from the DataFrame.
+    Return a dictionary of prompt data or indicate if all done.
+    """
+    global current_index, df
+    if current_index is None:
+        return {
+            "prompt": "All rows have been reviewed.",
+            "llm1_resp": "",
+            "llm2_resp": "",
+            "all_done": True
+        }
+    row = df.loc[current_index]
+    return {
+        "prompt": row["Prompt"],
+        "llm1_resp": row["LLM1 response"],
+        "llm2_resp": row["LLM2 response"],
+        "all_done": False
+    }
+def save_feedback(
+    preference,
+    factual_accuracy,
+    relevance,
+    llm1_issues,
+    llm2_issues,
+    llm1_tunisian_score,
+    llm2_tunisian_score
+):
+    """
+    Saves the feedback to the global DataFrame, writes it to disk,
+    and updates current_index to the next unreviewed row.
+    """
+    global df, current_index
+    if current_index is None:
+        return gr.update(value="No more rows to review!")
+    # Map the feedback to the columns in your dataset
+    # For example, "Human judges quality" could store the "preference".
+    df.at[current_index, "Human judges quality"] = preference
+    df.at[current_index, "Human judges correctness"] = factual_accuracy
+    df.at[current_index, "Human judges relevance"] = relevance
+    # Store flagged issues (you might want to store them as a comma-separated string)
+    df.at[current_index, "Human LLM1 flagged issues"] = ", ".join(
+        llm1_issues) if llm1_issues else ""
+    df.at[current_index, "Human LLM2 flagged issues"] = ", ".join(
+        llm2_issues) if llm2_issues else ""
+    # Store Tunisian Arabic usage scores
+    df.at[current_index, "Human LLM1 Tunisian usage score"] = llm1_tunisian_score
+    df.at[current_index, "Human LLM2 Tunisian usage score"] = llm2_tunisian_score
+    # Write back to Excel
+    df.to_excel(DATASET_PATH, index=False)
+    # Move to the next unreviewed row
+    next_unreviewed = df[df['Human judges quality'].isna()]
+    if len(next_unreviewed) == 0:
+        current_index = None
+        return gr.update(value="All rows have been reviewed. Thank you!")
+    else:
+        current_index = next_unreviewed.index[0]
+        return gr.update(value="Feedback saved! Moving to the next prompt...")
+def get_prompt_and_responses():
+    """
+    Retrieve the next prompt and responses from the DataFrame
+    and return them so they can be displayed in the interface.
+    """
+    data = get_next_prompt()
+    if data["all_done"]:
+        return (
+            data["prompt"],
+            data["llm1_resp"],
+            data["llm2_resp"],
+            "No next prompt. All done."
+        )
+    else:
+        return (
+            data["prompt"],
+            data["llm1_resp"],
+            data["llm2_resp"],
+            ""
+        )
+def refresh_ui():
+    """Helper to re-fetch the prompt data (e.g., after user feedback)."""
+    prompt, llm1_resp, llm2_resp, msg = get_prompt_and_responses()
+    return prompt, llm1_resp, llm2_resp, msg
+# Load the dataset once on startup
+load_dataset()
+with gr.Blocks() as demo:
+    gr.Markdown("# LLM Responses Evaluation")
+    # 1) Display the prompt and LLM responses
+    prompt_text = gr.Textbox(label="Prompt", interactive=False)
+    llm1_text = gr.Textbox(label="LLM1 Response", interactive=False)
+    llm2_text = gr.Textbox(label="LLM2 Response", interactive=False)
+    status_msg = gr.Markdown()
+    # 2) Radio for "Which response do you prefer?"
+    preference = gr.Radio(
+        ["LLM1", "LLM2", "Tie", "Both are bad"],
+        label="Which response do you prefer?",
+        value=None
+    )
+    # 3) Radio for "Which response is more factually accurate?"
+    factual_accuracy = gr.Radio(
+        ["LLM1", "LLM2", "Tie", "Both are bad"],
+        label="Which response is more factually accurate?",
+        value=None
+    )
+    # 4) Radio for "Which response better addresses the prompt?"
+    relevance = gr.Radio(
+        ["LLM1", "LLM2", "Tie", "Both are bad"],
+        label="Which response better addresses the prompt?",
+        value=None
+    )
+    # 5) Checkboxes for flagged issues in Response 1
+    llm1_issues = gr.CheckboxGroup(
+        [
+            "Hate Speech",
+            "Not Arabic",
+            "Inappropriate Content",
+            "Sexual Content",
+            "Untruthful Info",
+            "Violent Content",
+            "Personal Information"
+        ],
+        label="Does Response 1 contain any issues?"
+    )
+    # 6) Checkboxes for flagged issues in Response 2
+    llm2_issues = gr.CheckboxGroup(
+        [
+            "Hate Speech",
+            "Not Arabic",
+            "Inappropriate Content",
+            "Sexual Content",
+            "Untruthful Info",
+            "Violent Content",
+            "Personal Information"
+        ],
+        label="Does Response 2 contain any issues?"
+    )
+    # 7) Radio for LLM1's Tunisian Arabic usage score
+    llm1_tunisian_score = gr.Radio(
+        [0, 1, 2], label="Rate LLM1's use of Tunisian Arabic? 0: No Tunisian Arabic, 1: Mostly Tunisian Arabic, 2: Fully Tunisian Arabic", value=0)
+    # 8) Radio for LLM2's Tunisian Arabic usage score
+    llm2_tunisian_score = gr.Radio(
+        [0, 1, 2], label="Rate LLM2's use of Tunisian Arabic? 0: No Tunisian Arabic, 1: Mostly Tunisian Arabic, 2: Fully Tunisian Arabic", value=0)
+    # Submit button
+    submit_btn = gr.Button("Submit Feedback")
+    # On submit, save the feedback and show an update message
+    submit_btn.click(
+        fn=save_feedback,
+        inputs=[
+            preference,
+            factual_accuracy,
+            relevance,
+            llm1_issues,
+            llm2_issues,
+            llm1_tunisian_score,
+            llm2_tunisian_score
+        ],
+        outputs=status_msg
+    )
+    # Then auto-refresh the prompt/responses displayed
+    submit_btn.click(
+        fn=refresh_ui,
+        inputs=[],
+        outputs=[prompt_text, llm1_text, llm2_text, status_msg]
+    )
+    # Initialize with the first unreviewed row
+    demo.load(
+        fn=get_prompt_and_responses,
+        inputs=[],
+        outputs=[prompt_text, llm1_text, llm2_text, status_msg]
+    )
+# If you're running this locally, you'd do:
+demo.launch(share=True)
+# When uploading to HuggingFace Spaces, ensure you have a "requirements.txt"
+# with gradio, pandas, openpyxl so HF can build the environment.

dataset.xlsx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4999dbdc6db1e0e8ad10d69fa8f3966e80cd156e0f759d62569974d7055294a3
+size 567180

requirements.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+gradio
+pandas
+openpyxl