# Author: marouene addhoum
# I AM LOST
# Base revision: 31e3f57
import gradio as gr
import pandas as pd
import os
import shutil
import time
# --- Dataset bootstrap --------------------------------------------------
# Reference dataset shipped with the app (read-only seed copy).
initial_file = './Updated_HumanEvaluations_columns_and_rows_shuffled.xlsx'
# Local storage directory for the working copy of the dataset.
storage_dir = './data'
# The working dataset that accumulates the human evaluations.
storage_file = f'{storage_dir}/Updated_HumanEvaluations.xlsx'

# Make sure the storage directory exists before any file access.
os.makedirs(storage_dir, exist_ok=True)

print("πŸ“‚ Checking if dataset exists:", os.path.exists(storage_file))
print("πŸ“‚ Using dataset from:", storage_file)

# On first run, seed the working copy from the reference dataset.
if not os.path.exists(storage_file):
    print("⚠️ No existing dataset found, copying from reference dataset...")
    shutil.copy(initial_file, storage_file)

# Read every human-evaluation column as text so mixed values (scores,
# comma-joined flag lists, "N/A") survive Excel round-trips unchanged.
_human_eval_columns = (
    'Human judges quality',
    'Human judges correctness',
    'Human judges relevance',
    'Human LLM1 flagged issues',
    'Human LLM2 flagged issues',
    'Human LLM1 Tunisian usage score',
    'Human LLM2 Tunisian usage score',
)
df = pd.read_excel(storage_file, dtype={col: str for col in _human_eval_columns})

print("βœ… Dataset loaded successfully!")
print(df.head())  # quick sanity check of the loaded data
def get_next_prompt():
    """Return the next prompt still awaiting human evaluation.

    Reloads the dataset from disk so the latest saved state is used, then
    returns ``(prompt, llm1_response, llm2_response)`` for the first row
    with any human-evaluation column still empty, or the sentinel
    ``("No more prompts!", "", "")`` when every row is complete.
    """
    global df
    print("πŸ”„ Checking for next unfilled prompt...")
    # Reload so a fresh browser session always sees the latest saved state.
    df = pd.read_excel(storage_file)
    # Columns an evaluator must fill for a row to count as complete.
    eval_columns = [
        'Human judges quality',
        'Human judges correctness',
        'Human judges relevance',
        'Human LLM1 Tunisian usage score',
        'Human LLM2 Tunisian usage score',
        'Human LLM1 flagged issues',
        'Human LLM2 flagged issues',
    ]
    # Vectorized search: a row is unfilled when ANY evaluation column is NaN.
    unfilled = df[eval_columns].isna().any(axis=1)
    if unfilled.any():
        index = unfilled.idxmax()  # label of the first True entry
        print(f"βœ… Found next unfilled prompt at index {index}")
        row = df.loc[index]
        return row['Prompt'], row['LLM1 response'], row['LLM2 response']
    print("⚠️ No more unfilled prompts!")
    return "No more prompts!", "", ""
def submit_feedback(prefer, accuracy, relevance, llm1_flags, llm2_flags, llm1_tunisian, llm2_tunisian):
    """Record one evaluator submission and serve the next prompt.

    Writes the answers into the first row whose human-evaluation columns are
    still empty, persists the dataset to ``storage_file``, and returns the
    next ``(prompt, response1, response2)`` triple via ``get_next_prompt()``.

    Unanswered widgets are normalized before saving: radios -> "N/A",
    checkbox groups -> "None", Tunisian-usage scores -> 0.
    """
    global df
    print("🟑 Receiving feedback submission...")
    eval_columns = [
        'Human judges quality',
        'Human judges correctness',
        'Human judges relevance',
        'Human LLM1 Tunisian usage score',
        'Human LLM2 Tunisian usage score',
        'Human LLM1 flagged issues',
        'Human LLM2 flagged issues',
    ]
    for index, row in df.iterrows():
        if any(pd.isna(row[col]) for col in eval_columns):
            print(f"🟒 Updating row index {index} with feedback...")
            # Normalize every answer before writing it back; an unanswered
            # widget arrives as None (empty list for checkbox groups).
            df.at[index, 'Human judges quality'] = str(prefer) if prefer else "N/A"
            df.at[index, 'Human judges correctness'] = str(accuracy) if accuracy else "N/A"
            df.at[index, 'Human judges relevance'] = str(relevance) if relevance else "N/A"
            df.at[index, 'Human LLM1 flagged issues'] = ", ".join(llm1_flags) if llm1_flags else "None"
            df.at[index, 'Human LLM2 flagged issues'] = ", ".join(llm2_flags) if llm2_flags else "None"
            # `is not None`: a legitimate score of 0 must not be treated as
            # "no answer" (0 is falsy, so a plain truthiness test misfires).
            df.at[index, 'Human LLM1 Tunisian usage score'] = int(llm1_tunisian) if llm1_tunisian is not None else 0
            df.at[index, 'Human LLM2 Tunisian usage score'] = int(llm2_tunisian) if llm2_tunisian is not None else 0
            try:
                # BUG FIX: the previous df.fillna("N/A", inplace=True) here
                # filled the evaluation columns of every FUTURE row too, so
                # after one submission no row ever looked "unfilled" again.
                # Only the current row changed, and all of its evaluation
                # columns are set explicitly above, so no fillna is needed.
                df.to_excel(storage_file, index=False, engine="openpyxl")
                print("βœ… Dataset saved successfully at:", storage_file)
                # Reload to verify the write round-tripped.
                df = pd.read_excel(storage_file)
                print("πŸ”„ Reloaded dataset preview:")
                print(df.iloc[index])  # show the updated row to confirm changes
            except Exception as e:
                print("❌ ERROR Saving File:", str(e))
            break  # only one row is updated per submission
    return get_next_prompt()
# Shared choice lists for the comparison radios and the issue checkboxes.
COMPARISON_CHOICES = ["LLM1", "LLM2", "Tie", "Both are bad"]
ISSUE_CHOICES = ["Hate Speech", "Not Arabic", "Inappropriate Content", "Sexual Content",
                 "Untruthful Info", "Violent Content", "Personal Information"]

with gr.Blocks() as demo:
    gr.Markdown("## LLM Response Evaluation")

    # Read-only display of the prompt and the two candidate responses.
    prompt_output = gr.Textbox(label="Prompt", interactive=False)
    response1_output = gr.Textbox(label="Response 1", interactive=False)
    response2_output = gr.Textbox(label="Response 2", interactive=False)

    # Pairwise comparison questions.
    prefer = gr.Radio(COMPARISON_CHOICES, label="Which response do you prefer?")
    accuracy = gr.Radio(COMPARISON_CHOICES, label="Which response is more factually accurate?")
    relevance = gr.Radio(COMPARISON_CHOICES, label="Which response better addresses the prompt?")

    # Content-issue flags, one group per response.
    llm1_flags = gr.CheckboxGroup(ISSUE_CHOICES, label="Does Response 1 contain any issues?")
    llm2_flags = gr.CheckboxGroup(ISSUE_CHOICES, label="Does Response 2 contain any issues?")

    # Tunisian-Arabic usage scores on a 0-2 scale.
    llm1_tunisian = gr.Radio([0, 1, 2], label="Rate LLM1's use of Tunisian Arabic")
    llm2_tunisian = gr.Radio([0, 1, 2], label="Rate LLM2's use of Tunisian Arabic")

    submit_btn = gr.Button("Submit and Get New Prompt")
    submit_btn.click(
        submit_feedback,
        inputs=[prefer, accuracy, relevance, llm1_flags, llm2_flags,
                llm1_tunisian, llm2_tunisian],
        outputs=[prompt_output, response1_output, response2_output],
    )

    # Populate the first prompt as soon as the page loads.
    demo.load(
        get_next_prompt,
        outputs=[prompt_output, response1_output, response2_output],
    )

if __name__ == "__main__":
    demo.launch()