MaroueneA committed on
Commit
07de731
·
verified ·
1 Parent(s): e5f3c05

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +118 -142
app.py CHANGED
@@ -1,142 +1,118 @@
1
- import gradio as gr
2
- from datasets import load_dataset, Dataset
3
- import pandas as pd
4
- import os
5
- from huggingface_hub import HfApi
6
-
7
- # Read the Hugging Face token from the environment variable
8
- HF_TOKEN = os.environ.get("HF_TOKEN")
9
-
10
- # Authenticate with Hugging Face
11
- api = HfApi(token=HF_TOKEN)
12
-
13
- # Load the dataset from Hugging Face
14
- try:
15
- # Replace with your dataset file
16
- dataset = load_dataset("MaroueneA/feedback-dataset",
17
- data_files="dataset.csv")
18
- df = dataset["train"].to_pandas()
19
- if "CurrentPromptIndex" not in df.columns:
20
- df["CurrentPromptIndex"] = 0 # Initialize the column
21
- except Exception as e:
22
- print(f"Error loading dataset: {e}")
23
- df = pd.DataFrame(columns=[
24
- "Prompt", "LLM1 Response", "LLM2 Response", "Human judges quality", "Human judges correctness",
25
- "Human judges relevance", "Human LLM1 Tunisian usage score", "Human LLM2 Tunisian usage score",
26
- "Human LLM1 flagged issues", "Human LLM2 flagged issues", "Evaluated", "CurrentPromptIndex"
27
- ])
28
-
29
- # Function to save feedback to the dataset
30
-
31
-
32
- def save_feedback(prompt_idx, preference, factual_accuracy, relevance, llm1_issues, llm2_issues, llm1_tunisian, llm2_tunisian):
33
- # Update the dataset with feedback
34
- df.at[prompt_idx, "Human judges quality"] = str(preference)
35
- df.at[prompt_idx, "Human judges correctness"] = str(factual_accuracy)
36
- df.at[prompt_idx, "Human judges relevance"] = str(relevance)
37
- df.at[prompt_idx, "Human LLM1 Tunisian usage score"] = int(llm1_tunisian)
38
- df.at[prompt_idx, "Human LLM2 Tunisian usage score"] = int(llm2_tunisian)
39
- df.at[prompt_idx, "Human LLM1 flagged issues"] = ", ".join(llm1_issues)
40
- df.at[prompt_idx, "Human LLM2 flagged issues"] = ", ".join(llm2_issues)
41
- df.at[prompt_idx, "Evaluated"] = True
42
-
43
- # Convert the updated DataFrame back to a Hugging Face Dataset
44
- updated_dataset = Dataset.from_pandas(df)
45
-
46
- # Push the updated dataset back to Hugging Face
47
- updated_dataset.push_to_hub("MaroueneA/feedback-dataset", token=HF_TOKEN)
48
-
49
- # Function to get the next unevaluated prompt
50
-
51
-
52
- def get_next_prompt():
53
- # Get the current prompt index from the dataset
54
- current_prompt_idx = df["CurrentPromptIndex"].iloc[0]
55
- # Iterate through the DataFrame to find the next unevaluated prompt
56
- for idx in range(current_prompt_idx, len(df)):
57
- if not df.at[idx, "Evaluated"]:
58
- # Update the current prompt index in the dataset
59
- df.at[0, "CurrentPromptIndex"] = idx
60
- return df.iloc[idx]
61
- return None
62
-
63
-
64
- # Initialize the first prompt
65
- current_prompt = get_next_prompt()
66
- if current_prompt is not None:
67
- initial_prompt = current_prompt["Prompt"]
68
- initial_llm1 = current_prompt["LLM1 response"]
69
- initial_llm2 = current_prompt["LLM2 response"]
70
- else:
71
- initial_prompt = "No prompts available."
72
- initial_llm1 = ""
73
- initial_llm2 = ""
74
-
75
- # Function to submit feedback and get the next prompt
76
-
77
-
78
- def submit_feedback(preference, factual_accuracy, relevance, llm1_issues, llm2_issues, llm1_tunisian, llm2_tunisian):
79
- # Get the next unevaluated prompt
80
- next_prompt = get_next_prompt()
81
- if next_prompt is None:
82
- return "No more prompts available.", "", "", "Feedback saved successfully!"
83
-
84
- # Save feedback to the dataset
85
- save_feedback(df["CurrentPromptIndex"].iloc[0], preference, factual_accuracy,
86
- relevance, llm1_issues, llm2_issues, llm1_tunisian, llm2_tunisian)
87
-
88
- # Increment the prompt index and save it to the dataset
89
- df.at[0, "CurrentPromptIndex"] += 1
90
- print(f"Updated Prompt Index: {df['CurrentPromptIndex'].iloc[0]}")
91
-
92
- # Return the next prompt and responses
93
- return next_prompt["Prompt"], next_prompt["LLM1 response"], next_prompt["LLM2 response"], "Feedback saved successfully!"
94
-
95
-
96
- # Gradio interface
97
- with gr.Blocks() as demo:
98
- with gr.Row():
99
- prompt = gr.Textbox(
100
- label="Prompt", value=initial_prompt, interactive=False)
101
- with gr.Row():
102
- llm1_response = gr.Textbox(
103
- label="LLM1 Response", value=initial_llm1, interactive=False)
104
- llm2_response = gr.Textbox(
105
- label="LLM2 Response", value=initial_llm2, interactive=False)
106
- with gr.Row():
107
- preference = gr.Radio(
108
- ["LLM1", "LLM2", "Tie", "Both are bad"], label="Which response do you prefer?")
109
- factual_accuracy = gr.Radio(
110
- ["LLM1", "LLM2", "Tie", "Both are bad"], label="Which response is more factually accurate?")
111
- relevance = gr.Radio(["LLM1", "LLM2", "Tie", "Both are bad"],
112
- label="Which response better addresses the prompt?")
113
- with gr.Row():
114
- llm1_issues = gr.CheckboxGroup(
115
- ["Hate Speech", "Not Arabic", "Inappropriate Content", "Sexual Content",
116
- "Untruthful Info", "Violent Content", "Personal Information"],
117
- label="Does Response 1 contain any issues?"
118
- )
119
- llm2_issues = gr.CheckboxGroup(
120
- ["Hate Speech", "Not Arabic", "Inappropriate Content", "Sexual Content",
121
- "Untruthful Info", "Violent Content", "Personal Information"],
122
- label="Does Response 2 contain any issues?"
123
- )
124
- with gr.Row():
125
- llm1_tunisian = gr.Radio(
126
- [0, 1, 2], label="Rate LLM1's use of Tunisian Arabic")
127
- llm2_tunisian = gr.Radio(
128
- [0, 1, 2], label="Rate LLM2's use of Tunisian Arabic")
129
- with gr.Row():
130
- submit_btn = gr.Button("Submit Feedback and Next Prompt")
131
-
132
- # Submit feedback and load the next prompt
133
- submit_btn.click(
134
- submit_feedback,
135
- inputs=[preference, factual_accuracy, relevance,
136
- llm1_issues, llm2_issues, llm1_tunisian, llm2_tunisian],
137
- outputs=[prompt, llm1_response,
138
- llm2_response, gr.Textbox(label="Status")]
139
- )
140
-
141
- # Launch the app
142
- demo.launch()
 
1
+ import gradio as gr
2
+ from datasets import load_dataset, Dataset
3
+ import pandas as pd
4
+ import os
5
+ from huggingface_hub import HfApi
6
+
7
+ # Read the Hugging Face token from the environment variable
8
+ HF_TOKEN = os.environ.get("HF_TOKEN")
9
+
10
+ # Authenticate with Hugging Face
11
+ api = HfApi(token=HF_TOKEN)
12
+
13
# Load the feedback dataset from the Hugging Face Hub.
try:
    # Replace with your own dataset file if you fork this Space.
    loaded = load_dataset("MaroueneA/feedback-dataset", data_files="dataset.csv")
    df = loaded["train"].to_pandas()
    # Make sure the bookkeeping column exists (older CSVs may lack it).
    if "CurrentPromptIndex" not in df.columns:
        df["CurrentPromptIndex"] = 0
except Exception as e:
    # Fall back to an empty frame with the expected schema so the UI still starts.
    print(f"Error loading dataset: {e}")
    df = pd.DataFrame(columns=[
        "Prompt", "LLM1 Response", "LLM2 Response",
        "Human judges quality", "Human judges correctness", "Human judges relevance",
        "Human LLM1 Tunisian usage score", "Human LLM2 Tunisian usage score",
        "Human LLM1 flagged issues", "Human LLM2 flagged issues",
        "Evaluated", "CurrentPromptIndex",
    ])
26
+
27
# Function to save feedback to the dataset
def save_feedback(prompt_idx, preference, factual_accuracy, relevance,
                  llm1_issues, llm2_issues, llm1_tunisian, llm2_tunisian):
    """Record one evaluation in the global dataframe and push it to the Hub.

    Args:
        prompt_idx: Row index of the prompt being evaluated.
        preference, factual_accuracy, relevance: Radio selections
            ("LLM1", "LLM2", "Tie", "Both are bad"), or None if unselected.
        llm1_issues, llm2_issues: Lists of flagged-issue strings (may be None/empty).
        llm1_tunisian, llm2_tunisian: 0-2 rating, or None if left unselected.
    """
    df.at[prompt_idx, "Human judges quality"] = str(preference)
    df.at[prompt_idx, "Human judges correctness"] = str(factual_accuracy)
    df.at[prompt_idx, "Human judges relevance"] = str(relevance)
    # Gradio Radio/CheckboxGroup components yield None when nothing is
    # selected; guard the int() / join() conversions so an incomplete form
    # cannot crash the handler with a TypeError.
    df.at[prompt_idx, "Human LLM1 Tunisian usage score"] = int(llm1_tunisian or 0)
    df.at[prompt_idx, "Human LLM2 Tunisian usage score"] = int(llm2_tunisian or 0)
    df.at[prompt_idx, "Human LLM1 flagged issues"] = ", ".join(llm1_issues or [])
    df.at[prompt_idx, "Human LLM2 flagged issues"] = ", ".join(llm2_issues or [])
    df.at[prompt_idx, "Evaluated"] = True

    # Persist: convert the updated frame back to a Dataset and push it to the Hub.
    updated_dataset = Dataset.from_pandas(df)
    updated_dataset.push_to_hub("MaroueneA/feedback-dataset", token=HF_TOKEN)
44
+
45
# Function to get the next unevaluated prompt
def get_next_prompt():
    """Return the first unevaluated row at or after the stored index, or None.

    Side effect: writes the found index back into row 0's "CurrentPromptIndex"
    cell so the position is persisted with the dataset.
    """
    # The fallback schema produces an empty frame; without this guard
    # iloc[0] below raises IndexError and the app never starts.
    if df.empty:
        return None
    current_prompt_idx = int(df["CurrentPromptIndex"].iloc[0])
    for idx in range(current_prompt_idx, len(df)):
        evaluated = df.at[idx, "Evaluated"]
        # Treat NaN / missing as "not yet evaluated": NaN is truthy, so the
        # original `not df.at[...]` test skipped rows whose Evaluated cell
        # was never filled in.
        if pd.isna(evaluated) or not bool(evaluated):
            df.at[0, "CurrentPromptIndex"] = idx
            return df.iloc[idx]
    return None
55
+
56
# Initialize the first prompt shown by the UI.
current_prompt = get_next_prompt()
if current_prompt is not None:
    initial_prompt = current_prompt["Prompt"]
    # Tolerate either capitalization: the fallback schema declares
    # "LLM1 Response" while these lookups originally used "LLM1 response",
    # so a plain [] access KeyErrors on one of the two schemas.
    initial_llm1 = current_prompt.get("LLM1 response",
                                      current_prompt.get("LLM1 Response", ""))
    initial_llm2 = current_prompt.get("LLM2 response",
                                      current_prompt.get("LLM2 Response", ""))
else:
    initial_prompt = "No prompts available."
    initial_llm1 = ""
    initial_llm2 = ""
66
+
67
# Function to submit feedback and get the next prompt
def submit_feedback(preference, factual_accuracy, relevance,
                    llm1_issues, llm2_issues, llm1_tunisian, llm2_tunisian):
    """Gradio handler: save feedback for the prompt on screen, then advance.

    Returns:
        (prompt, llm1_response, llm2_response, status) matching the UI outputs.
    """
    # Save feedback for the row currently on screen BEFORE advancing.
    # The previous implementation called get_next_prompt() first, which moved
    # "CurrentPromptIndex" forward and attributed the feedback to the wrong
    # row from the second submission onward, and dropped the final
    # evaluation entirely when no prompts remained.
    current_idx = int(df["CurrentPromptIndex"].iloc[0])
    save_feedback(current_idx, preference, factual_accuracy, relevance,
                  llm1_issues, llm2_issues, llm1_tunisian, llm2_tunisian)

    # The row just saved is now marked Evaluated, so this skips past it;
    # no extra manual increment is needed (the old `+= 1` could skip prompts).
    next_prompt = get_next_prompt()
    if next_prompt is None:
        return "No more prompts available.", "", "", "Feedback saved successfully!"

    print(f"Updated Prompt Index: {df['CurrentPromptIndex'].iloc[0]}")

    # Tolerate either capitalization of the response columns (the fallback
    # schema uses "LLM1 Response", the original lookups "LLM1 response").
    return (next_prompt["Prompt"],
            next_prompt.get("LLM1 response", next_prompt.get("LLM1 Response", "")),
            next_prompt.get("LLM2 response", next_prompt.get("LLM2 Response", "")),
            "Feedback saved successfully!")
83
+
84
# Gradio interface: prompt + two model responses, judgment widgets, and a
# submit button that records the feedback and advances to the next prompt.
_CHOICES = ["LLM1", "LLM2", "Tie", "Both are bad"]
_ISSUE_OPTIONS = [
    "Hate Speech", "Not Arabic", "Inappropriate Content", "Sexual Content",
    "Untruthful Info", "Violent Content", "Personal Information",
]

with gr.Blocks() as demo:
    with gr.Row():
        prompt = gr.Textbox(label="Prompt", value=initial_prompt, interactive=False)
    with gr.Row():
        llm1_response = gr.Textbox(label="LLM1 Response", value=initial_llm1, interactive=False)
        llm2_response = gr.Textbox(label="LLM2 Response", value=initial_llm2, interactive=False)
    with gr.Row():
        preference = gr.Radio(_CHOICES, label="Which response do you prefer?")
        factual_accuracy = gr.Radio(_CHOICES, label="Which response is more factually accurate?")
        relevance = gr.Radio(_CHOICES, label="Which response better addresses the prompt?")
    with gr.Row():
        llm1_issues = gr.CheckboxGroup(_ISSUE_OPTIONS, label="Does Response 1 contain any issues?")
        llm2_issues = gr.CheckboxGroup(_ISSUE_OPTIONS, label="Does Response 2 contain any issues?")
    with gr.Row():
        llm1_tunisian = gr.Radio([0, 1, 2], label="Rate LLM1's use of Tunisian Arabic")
        llm2_tunisian = gr.Radio([0, 1, 2], label="Rate LLM2's use of Tunisian Arabic")
    with gr.Row():
        submit_btn = gr.Button("Submit Feedback and Next Prompt")

    # Status box for the save confirmation message (instantiated at the same
    # point the original created it inline in the outputs list).
    status = gr.Textbox(label="Status")

    # Wire the button: save feedback, then load the next prompt.
    submit_btn.click(
        submit_feedback,
        inputs=[preference, factual_accuracy, relevance,
                llm1_issues, llm2_issues, llm1_tunisian, llm2_tunisian],
        outputs=[prompt, llm1_response, llm2_response, status],
    )

# Launch the app
demo.launch()