Spaces:
Runtime error
Runtime error
added causal lm eval
Browse files- human_eval.py +109 -23
human_eval.py
CHANGED
|
@@ -33,6 +33,22 @@ def create_html_media(media_path, is_gif=False):
|
|
| 33 |
"""
|
| 34 |
return html_string
|
| 35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
class LMBattleArena:
|
| 37 |
def __init__(self, dataset_path):
|
| 38 |
"""Initialize battle arena with dataset"""
|
|
@@ -40,23 +56,29 @@ class LMBattleArena:
|
|
| 40 |
print(self.df.head())
|
| 41 |
self.current_index = 0
|
| 42 |
self.saving_freq = 10 # save the results in csv/push to hub every 10 evaluations
|
| 43 |
-
self.
|
|
|
|
| 44 |
self.model_scores = defaultdict(lambda: {'wins': 0, 'total_comparisons': 0})
|
| 45 |
|
| 46 |
-
def get_next_battle_pair(self):
|
| 47 |
"""Retrieve next pair of summaries for comparison"""
|
| 48 |
if self.current_index >= len(self.df):
|
| 49 |
return None
|
| 50 |
|
| 51 |
row = self.df.iloc[self.current_index]
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
selected_models = random.sample(model_summary_cols, 2)
|
| 58 |
battle_data = {
|
| 59 |
-
'prompt': row['
|
| 60 |
'model_1': row[selected_models[0]],
|
| 61 |
'model_2': row[selected_models[1]],
|
| 62 |
'model1_name': selected_models[0],
|
|
@@ -65,7 +87,7 @@ class LMBattleArena:
|
|
| 65 |
self.current_index += 1
|
| 66 |
return battle_data
|
| 67 |
|
| 68 |
-
def record_evaluation(self, preferred_models, input_text, output1, output2, model1_name, model2_name):
|
| 69 |
"""Record user's model preference and update scores"""
|
| 70 |
self.model_scores[model1_name]['total_comparisons'] += 1
|
| 71 |
self.model_scores[model2_name]['total_comparisons'] += 1
|
|
@@ -87,14 +109,23 @@ class LMBattleArena:
|
|
| 87 |
'model2_name': model2_name,
|
| 88 |
'preferred_models': preferred_models
|
| 89 |
}
|
| 90 |
-
|
|
|
|
|
|
|
|
|
|
| 91 |
|
| 92 |
-
return self.get_model_scores_df()
|
| 93 |
|
| 94 |
-
def get_model_scores_df(self):
|
| 95 |
"""Convert model scores to DataFrame"""
|
| 96 |
scores_data = []
|
| 97 |
for model, stats in self.model_scores.items():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
win_rate = (stats['wins'] / stats['total_comparisons'] * 100) if stats['total_comparisons'] > 0 else 0
|
| 99 |
scores_data.append({
|
| 100 |
'Model': model,
|
|
@@ -113,11 +144,11 @@ class LMBattleArena:
|
|
| 113 |
return results_df
|
| 114 |
|
| 115 |
|
| 116 |
-
def create_battle_arena(dataset_path, is_gif):
|
| 117 |
arena = LMBattleArena(dataset_path)
|
| 118 |
|
| 119 |
-
def battle_round():
|
| 120 |
-
battle_data = arena.get_next_battle_pair()
|
| 121 |
|
| 122 |
if battle_data is None:
|
| 123 |
return "No more texts to evaluate!", "", "", "", "", gr.DataFrame(visible=False)
|
|
@@ -131,11 +162,11 @@ def create_battle_arena(dataset_path, is_gif):
|
|
| 131 |
gr.DataFrame(visible=True)
|
| 132 |
)
|
| 133 |
|
| 134 |
-
def submit_preference(input_text, output_1, output_2, model1_name, model2_name, preferred_models):
|
| 135 |
scores_df = arena.record_evaluation(
|
| 136 |
-
preferred_models, input_text, output_1, output_2, model1_name, model2_name
|
| 137 |
)
|
| 138 |
-
next_battle = battle_round()
|
| 139 |
return (*next_battle[:-1], scores_df)
|
| 140 |
|
| 141 |
with gr.Blocks(css="footer{display:none !important}") as demo:
|
|
@@ -145,9 +176,60 @@ def create_battle_arena(dataset_path, is_gif):
|
|
| 145 |
gr.HTML(create_html_media(local_image_path, is_gif=is_gif))
|
| 146 |
|
| 147 |
with gr.Tabs():
|
| 148 |
-
with gr.Tab("Battle Arena"):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
gr.Markdown("# π€ Pretrained SmolLMs Battle Arena")
|
| 150 |
|
|
|
|
|
|
|
|
|
|
| 151 |
input_text = gr.Textbox(
|
| 152 |
label="Input prompt",
|
| 153 |
interactive=False,
|
|
@@ -180,18 +262,22 @@ def create_battle_arena(dataset_path, is_gif):
|
|
| 180 |
|
| 181 |
submit_btn.click(
|
| 182 |
submit_preference,
|
| 183 |
-
inputs=[input_text, output_1, output_2, model1_name, model2_name, preferred_models],
|
| 184 |
outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table]
|
| 185 |
)
|
| 186 |
|
| 187 |
-
demo.load(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 188 |
|
| 189 |
return demo
|
| 190 |
|
| 191 |
if __name__ == "__main__":
|
| 192 |
|
| 193 |
# load the existing dataset that contains outputs of the LMs
|
| 194 |
-
human_eval_dataset = load_dataset("atlasia/Moroccan-Darija-
|
| 195 |
|
| 196 |
# precision
|
| 197 |
torch_dtype = torch.float16
|
|
@@ -200,5 +286,5 @@ if __name__ == "__main__":
|
|
| 200 |
device = "cpu" #"cuda" if torch.cuda.is_available() else "cpu"
|
| 201 |
dataset_path = 'human_eval_dataset.csv'
|
| 202 |
is_gif = True
|
| 203 |
-
demo = create_battle_arena(dataset_path, is_gif)
|
| 204 |
demo.launch(debug=True)
|
|
|
|
| 33 |
"""
|
| 34 |
return html_string
|
| 35 |
|
| 36 |
+
MASKED_LM_MODELS = [
|
| 37 |
+
"BounharAbdelaziz/XLM-RoBERTa-Morocco",
|
| 38 |
+
"SI2M-Lab/DarijaBERT",
|
| 39 |
+
"BounharAbdelaziz/ModernBERT-Morocco",
|
| 40 |
+
"google-bert/bert-base-multilingual-cased",
|
| 41 |
+
"FacebookAI/xlm-roberta-large",
|
| 42 |
+
"aubmindlab/bert-base-arabertv02",
|
| 43 |
+
]
|
| 44 |
+
|
| 45 |
+
CAUSAL_LM_MODELS = [
|
| 46 |
+
"BounharAbdelaziz/Al-Atlas-LLM-0.5B",
|
| 47 |
+
"Qwen/Qwen2.5-0.5B",
|
| 48 |
+
"tiiuae/Falcon3-1B-Base",
|
| 49 |
+
"MBZUAI-Paris/Atlas-Chat-2B",
|
| 50 |
+
]
|
| 51 |
+
|
| 52 |
class LMBattleArena:
|
| 53 |
def __init__(self, dataset_path):
|
| 54 |
"""Initialize battle arena with dataset"""
|
|
|
|
| 56 |
print(self.df.head())
|
| 57 |
self.current_index = 0
|
| 58 |
self.saving_freq = 10 # save the results in csv/push to hub every 10 evaluations
|
| 59 |
+
self.evaluation_results_masked = []
|
| 60 |
+
self.evaluation_results_causal = []
|
| 61 |
self.model_scores = defaultdict(lambda: {'wins': 0, 'total_comparisons': 0})
|
| 62 |
|
| 63 |
+
def get_next_battle_pair(self, is_causal):
|
| 64 |
"""Retrieve next pair of summaries for comparison"""
|
| 65 |
if self.current_index >= len(self.df):
|
| 66 |
return None
|
| 67 |
|
| 68 |
row = self.df.iloc[self.current_index]
|
| 69 |
+
if is_causal:
|
| 70 |
+
model_summary_cols = [
|
| 71 |
+
col
|
| 72 |
+
for col in CAUSAL_LM_MODELS
|
| 73 |
+
]
|
| 74 |
+
else:
|
| 75 |
+
model_summary_cols = [
|
| 76 |
+
col
|
| 77 |
+
for col in MASKED_LM_MODELS
|
| 78 |
+
]
|
| 79 |
selected_models = random.sample(model_summary_cols, 2)
|
| 80 |
battle_data = {
|
| 81 |
+
'prompt': row['masked_sentence'] if not is_causal else row['causal_sentence'],
|
| 82 |
'model_1': row[selected_models[0]],
|
| 83 |
'model_2': row[selected_models[1]],
|
| 84 |
'model1_name': selected_models[0],
|
|
|
|
| 87 |
self.current_index += 1
|
| 88 |
return battle_data
|
| 89 |
|
| 90 |
+
def record_evaluation(self, preferred_models, input_text, output1, output2, model1_name, model2_name, is_causal):
|
| 91 |
"""Record user's model preference and update scores"""
|
| 92 |
self.model_scores[model1_name]['total_comparisons'] += 1
|
| 93 |
self.model_scores[model2_name]['total_comparisons'] += 1
|
|
|
|
| 109 |
'model2_name': model2_name,
|
| 110 |
'preferred_models': preferred_models
|
| 111 |
}
|
| 112 |
+
if is_causal:
|
| 113 |
+
self.evaluation_results_causal.append(evaluation)
|
| 114 |
+
else:
|
| 115 |
+
self.evaluation_results_masked.append(evaluation)
|
| 116 |
|
| 117 |
+
return self.get_model_scores_df(is_causal)
|
| 118 |
|
| 119 |
+
def get_model_scores_df(self, is_causal):
|
| 120 |
"""Convert model scores to DataFrame"""
|
| 121 |
scores_data = []
|
| 122 |
for model, stats in self.model_scores.items():
|
| 123 |
+
if is_causal:
|
| 124 |
+
if model not in CAUSAL_LM_MODELS:
|
| 125 |
+
continue
|
| 126 |
+
else:
|
| 127 |
+
if model not in MASKED_LM_MODELS:
|
| 128 |
+
continue
|
| 129 |
win_rate = (stats['wins'] / stats['total_comparisons'] * 100) if stats['total_comparisons'] > 0 else 0
|
| 130 |
scores_data.append({
|
| 131 |
'Model': model,
|
|
|
|
| 144 |
return results_df
|
| 145 |
|
| 146 |
|
| 147 |
+
def create_battle_arena(dataset_path, is_gif, is_causal):
|
| 148 |
arena = LMBattleArena(dataset_path)
|
| 149 |
|
| 150 |
+
def battle_round(is_causal):
|
| 151 |
+
battle_data = arena.get_next_battle_pair(is_causal)
|
| 152 |
|
| 153 |
if battle_data is None:
|
| 154 |
return "No more texts to evaluate!", "", "", "", "", gr.DataFrame(visible=False)
|
|
|
|
| 162 |
gr.DataFrame(visible=True)
|
| 163 |
)
|
| 164 |
|
| 165 |
+
def submit_preference(input_text, output_1, output_2, model1_name, model2_name, preferred_models, is_causal):
|
| 166 |
scores_df = arena.record_evaluation(
|
| 167 |
+
preferred_models, input_text, output_1, output_2, model1_name, model2_name, is_causal
|
| 168 |
)
|
| 169 |
+
next_battle = battle_round(is_causal)
|
| 170 |
return (*next_battle[:-1], scores_df)
|
| 171 |
|
| 172 |
with gr.Blocks(css="footer{display:none !important}") as demo:
|
|
|
|
| 176 |
gr.HTML(create_html_media(local_image_path, is_gif=is_gif))
|
| 177 |
|
| 178 |
with gr.Tabs():
|
| 179 |
+
with gr.Tab("Masked LM Battle Arena"):
|
| 180 |
+
gr.Markdown("# π€ Pretrained SmolLMs Battle Arena")
|
| 181 |
+
|
| 182 |
+
# Use gr.State to store the boolean value without displaying it
|
| 183 |
+
is_causal = gr.State(value=False)
|
| 184 |
+
|
| 185 |
+
input_text = gr.Textbox(
|
| 186 |
+
label="Input prompt",
|
| 187 |
+
interactive=False,
|
| 188 |
+
)
|
| 189 |
+
|
| 190 |
+
with gr.Row():
|
| 191 |
+
output_1 = gr.Textbox(
|
| 192 |
+
label="Model A",
|
| 193 |
+
interactive=False
|
| 194 |
+
)
|
| 195 |
+
model1_name = gr.State() # Hidden state for model1 name
|
| 196 |
+
|
| 197 |
+
with gr.Row():
|
| 198 |
+
output_2 = gr.Textbox(
|
| 199 |
+
label="Model B",
|
| 200 |
+
interactive=False
|
| 201 |
+
)
|
| 202 |
+
model2_name = gr.State() # Hidden state for model2 name
|
| 203 |
+
|
| 204 |
+
preferred_models = gr.Radio(
|
| 205 |
+
label="Which model is better?",
|
| 206 |
+
choices=["Model A", "Model B", "Both Good", "Both Bad"]
|
| 207 |
+
)
|
| 208 |
+
submit_btn = gr.Button("Vote", variant="primary")
|
| 209 |
+
|
| 210 |
+
scores_table = gr.DataFrame(
|
| 211 |
+
headers=['Model', 'Wins', 'Total Comparisons', 'Win Rate (%)'],
|
| 212 |
+
label="π Leaderboard"
|
| 213 |
+
)
|
| 214 |
+
|
| 215 |
+
submit_btn.click(
|
| 216 |
+
submit_preference,
|
| 217 |
+
inputs=[input_text, output_1, output_2, model1_name, model2_name, preferred_models, is_causal],
|
| 218 |
+
outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table]
|
| 219 |
+
)
|
| 220 |
+
|
| 221 |
+
demo.load(
|
| 222 |
+
battle_round,
|
| 223 |
+
inputs=[is_causal],
|
| 224 |
+
outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table]
|
| 225 |
+
)
|
| 226 |
+
|
| 227 |
+
with gr.Tab("Causal LM Battle Arena"):
|
| 228 |
gr.Markdown("# π€ Pretrained SmolLMs Battle Arena")
|
| 229 |
|
| 230 |
+
# Use gr.State to store the boolean value without displaying it
|
| 231 |
+
is_causal = gr.State(value=True)
|
| 232 |
+
|
| 233 |
input_text = gr.Textbox(
|
| 234 |
label="Input prompt",
|
| 235 |
interactive=False,
|
|
|
|
| 262 |
|
| 263 |
submit_btn.click(
|
| 264 |
submit_preference,
|
| 265 |
+
inputs=[input_text, output_1, output_2, model1_name, model2_name, preferred_models, is_causal],
|
| 266 |
outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table]
|
| 267 |
)
|
| 268 |
|
| 269 |
+
demo.load(
|
| 270 |
+
battle_round,
|
| 271 |
+
inputs=[is_causal],
|
| 272 |
+
outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table]
|
| 273 |
+
)
|
| 274 |
|
| 275 |
return demo
|
| 276 |
|
| 277 |
if __name__ == "__main__":
|
| 278 |
|
| 279 |
# load the existing dataset that contains outputs of the LMs
|
| 280 |
+
human_eval_dataset = load_dataset("atlasia/LM-Moroccan-Darija-Bench", split='test').to_csv('human_eval_dataset.csv')
|
| 281 |
|
| 282 |
# precision
|
| 283 |
torch_dtype = torch.float16
|
|
|
|
| 286 |
device = "cpu" #"cuda" if torch.cuda.is_available() else "cpu"
|
| 287 |
dataset_path = 'human_eval_dataset.csv'
|
| 288 |
is_gif = True
|
| 289 |
+
demo = create_battle_arena(dataset_path, is_gif, is_causal=False)
|
| 290 |
demo.launch(debug=True)
|