Update app.py
Browse files
app.py
CHANGED
|
@@ -68,69 +68,24 @@ def load_line_json_data(filename):
|
|
| 68 |
def add_new_eval(
|
| 69 |
val_or_test: str,
|
| 70 |
eval_mode: str,
|
| 71 |
-
model: str,
|
| 72 |
-
tooluse_strategy: str,
|
| 73 |
-
planning_strategy: str,
|
| 74 |
-
organization: str,
|
| 75 |
-
mail: str,
|
| 76 |
path_to_file: str,
|
| 77 |
):
|
| 78 |
-
# Very basic email parsing
|
| 79 |
-
_, parsed_mail = parseaddr(mail)
|
| 80 |
-
if not "@" in parsed_mail:
|
| 81 |
-
return format_warning("Please provide a valid email adress.")
|
| 82 |
-
|
| 83 |
print("Adding new eval")
|
| 84 |
|
| 85 |
if path_to_file is None:
|
| 86 |
return format_warning("Please attach a file.")
|
| 87 |
|
| 88 |
-
# Save submitted file
|
| 89 |
-
api.upload_file(
|
| 90 |
-
repo_id=RESULTS_DATASET,
|
| 91 |
-
path_or_fileobj=path_to_file.name,
|
| 92 |
-
path_in_repo=f"{organization}/{val_or_test}_{eval_mode}_{tooluse_strategy}_{planning_strategy}_raw_{datetime.datetime.today()}.jsonl",
|
| 93 |
-
repo_type="dataset",
|
| 94 |
-
token=TOKEN
|
| 95 |
-
)
|
| 96 |
|
| 97 |
# Compute score
|
| 98 |
file_path = path_to_file.name
|
| 99 |
-
result = eval_score(val_or_test,file_path=file_path,TOKEN=TOKEN)
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
repo_id=RESULTS_DATASET,
|
| 106 |
-
path_or_fileobj=f"scored/{organization}_{val_or_test}_{eval_mode}_{tooluse_strategy}_{planning_strategy}.jsonl",
|
| 107 |
-
path_in_repo=f"{organization}/{model}/{val_or_test}_{eval_mode}_{tooluse_strategy}_{planning_strategy}_scored_{datetime.datetime.today()}.jsonl",
|
| 108 |
-
repo_type="dataset",
|
| 109 |
-
token=TOKEN
|
| 110 |
-
)
|
| 111 |
-
|
| 112 |
-
# Actual submission
|
| 113 |
-
eval_entry = {
|
| 114 |
-
"Model": model,
|
| 115 |
-
"Tool-use Strategy": tooluse_strategy,
|
| 116 |
-
"Planning Strategy": planning_strategy,
|
| 117 |
-
"Organization": organization,
|
| 118 |
-
"Mail": mail,
|
| 119 |
-
"Delivery Rate": result['Delivery Rate'],
|
| 120 |
-
"Commonsense Constraint Micro Pass Rate":result['Commonsense Constraint Micro Pass Rate'],
|
| 121 |
-
"Commonsense Constraint Macro Pass Rate":result['Commonsense Constraint Macro Pass Rate'],
|
| 122 |
-
"Hard Constraint Micro Pass Rate":result['Hard Constraint Micro Pass Rate'],
|
| 123 |
-
"Hard Constraint Macro Pass Rate":result['Hard Constraint Macro Pass Rate'],
|
| 124 |
-
"Final Pass Rate":result['Final Pass Rate']
|
| 125 |
-
}
|
| 126 |
-
eval_mode = eval_mode.replace('-','')
|
| 127 |
-
eval_results[f'{val_or_test}_{eval_mode}'] = eval_results[f'{val_or_test}_{eval_mode}'].add_item(eval_entry)
|
| 128 |
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
eval_results.push_to_hub(RESULTS_DATASET, config_name = 'scores', token=TOKEN)
|
| 132 |
-
|
| 133 |
-
return format_log(f"Model: {model} | Tool-use Strategy: {tooluse_strategy} | Planning Strategy: {planning_strategy} | submitted by {organization} successfully. \nPlease refresh the leaderboard, and wait a bit to see the score displayed (Validation ~2mins, Test ~7mins).")
|
| 134 |
|
| 135 |
|
| 136 |
def refresh():
|
|
@@ -141,9 +96,6 @@ def refresh():
|
|
| 141 |
eval_dataframe_test_soleplanning = get_dataframe_from_results(eval_results=eval_results, split="test",mode='soleplanning')
|
| 142 |
return eval_dataframe_val_twostage, eval_dataframe_val_soleplanning, eval_dataframe_test_twostage, eval_dataframe_test_soleplanning
|
| 143 |
|
| 144 |
-
# def upload_file(files):
|
| 145 |
-
# file_paths = [file.name for file in files]
|
| 146 |
-
# return file_paths
|
| 147 |
|
| 148 |
|
| 149 |
demo = gr.Blocks()
|
|
@@ -185,14 +137,8 @@ with demo:
|
|
| 185 |
with gr.Column():
|
| 186 |
level_of_test = gr.Radio(["validation", "test"], value="validation", label="Split")
|
| 187 |
eval_mode = gr.Radio(["two-stage", "sole-planning"], value="two-stage", label="Eval Mode")
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
planning_strategy = gr.Textbox(label="Planning Strategy")
|
| 191 |
-
with gr.Column():
|
| 192 |
-
organization = gr.Textbox(label="Organization")
|
| 193 |
-
mail = gr.Textbox(label="Contact email")
|
| 194 |
-
file_output = gr.File()
|
| 195 |
-
|
| 196 |
|
| 197 |
submit_button = gr.Button("Submit Eval")
|
| 198 |
submission_result = gr.Markdown()
|
|
@@ -201,16 +147,12 @@ with demo:
|
|
| 201 |
[
|
| 202 |
level_of_test,
|
| 203 |
eval_mode,
|
| 204 |
-
|
| 205 |
-
tooluse_strategy,
|
| 206 |
-
planning_strategy,
|
| 207 |
-
organization,
|
| 208 |
-
mail,
|
| 209 |
-
file_output,
|
| 210 |
],
|
| 211 |
-
submission_result,
|
| 212 |
)
|
| 213 |
|
| 214 |
demo.launch(debug=True)
|
| 215 |
|
| 216 |
|
|
|
|
|
|
| 68 |
def add_new_eval(
    val_or_test: str,
    eval_mode: str,
    path_to_file,  # gradio File payload; only `.name` (a temp-file path) is read
):
    """Score an uploaded submission file and expose the detailed report.

    Parameters
    ----------
    val_or_test : str
        Split selector, "validation" or "test"; forwarded to ``eval_score``.
    eval_mode : str
        "two-stage" or "sole-planning". Unused in this body; kept because the
        gradio click handler passes it positionally.
    path_to_file :
        File uploaded through the gradio ``File`` component, or ``None`` when
        nothing was attached.

    Returns
    -------
    Either a single warning string (no file attached), or a pair of
    (markdown log, ``gr.File``): the summary score plus a now-visible
    download component pointing at the per-constraint report JSON
    written to the current directory.
    """
    print("Adding new eval")

    if path_to_file is None:
        return format_warning("Please attach a file.")

    # Compute score from the uploaded temp file.
    file_path = path_to_file.name
    result, detail_json = eval_score(val_or_test, file_path=file_path, TOKEN=TOKEN)

    # Persist the per-constraint detail so the user can download it.
    # NOTE(review): the pre-change code called datetime.datetime.today(),
    # which implies `import datetime`; `datetime.now()` here only works if
    # the import was changed to `from datetime import datetime` — confirm.
    output_path = os.path.join('.', datetime.now().strftime('%Y%m%d%H%M%S') + '.json')
    with open(output_path, 'w') as w:
        json.dump(detail_json, w)

    return (
        format_log(f"{result}"),
        gr.File(
            label="Download the detailed constraint pass rate reports",
            value=output_path,
            visible=True,
        ),
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
|
| 90 |
|
| 91 |
def refresh():
|
|
|
|
| 96 |
eval_dataframe_test_soleplanning = get_dataframe_from_results(eval_results=eval_results, split="test",mode='soleplanning')
|
| 97 |
return eval_dataframe_val_twostage, eval_dataframe_val_soleplanning, eval_dataframe_test_twostage, eval_dataframe_test_soleplanning
|
| 98 |
|
|
|
|
|
|
|
|
|
|
| 99 |
|
| 100 |
|
| 101 |
demo = gr.Blocks()
|
|
|
|
| 137 |
with gr.Column():
|
| 138 |
level_of_test = gr.Radio(["validation", "test"], value="validation", label="Split")
|
| 139 |
eval_mode = gr.Radio(["two-stage", "sole-planning"], value="two-stage", label="Eval Mode")
|
| 140 |
+
file_input = gr.File(label="Upload file")
|
| 141 |
+
file_output = gr.File(label="Download the detailed constraint pass rate reports", visible=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
|
| 143 |
submit_button = gr.Button("Submit Eval")
|
| 144 |
submission_result = gr.Markdown()
|
|
|
|
| 147 |
[
|
| 148 |
level_of_test,
|
| 149 |
eval_mode,
|
| 150 |
+
file_input,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
],
|
| 152 |
+
[submission_result, file_output]
|
| 153 |
)
|
| 154 |
|
| 155 |
demo.launch(debug=True)
|
| 156 |
|
| 157 |
|
| 158 |
+
|