Update app.py
app.py CHANGED

@@ -125,7 +125,7 @@ def load_dataset_by_name(dataset_name, split="train"):
     retry=retry_if_exception_type(Exception)
 )
 
-def get_model_response(question, options, prompt_template, model_name):
+def get_model_response(question, options, prompt_template, model_name, clients):
     with semaphore:
         try:
             model_config = MODELS[model_name]
@@ -135,23 +135,21 @@ def get_model_response(question, options, prompt_template, model_name):
             provider = model_config["provider"]
 
             if provider == "togetherai":
-                response =
+                response = clients["togetherai"].chat.completions.create(
                     model=model_config["model_id"],
                     messages=[{"role": "user", "content": prompt}]
                 )
                 response_text = response.choices[0].message.content.strip()
 
             elif provider == "openai":
-                response =
+                response = clients["openai"].chat.completions.create(
                     model=model_config["model_id"],
-                    messages=[{
-                        "role": "user",
-                        "content": prompt}]
+                    messages=[{"role": "user", "content": prompt}]
                 )
                 response_text = response.choices[0].message.content.strip()
 
             elif provider == "anthropic":
-                response =
+                response = clients["anthropic"].messages.create(
                     model=model_config["model_id"],
                     messages=[{"role": "user", "content": prompt}],
                     max_tokens=4096
@@ -189,12 +187,13 @@ def evaluate_response(model_response, correct_answer):
     is_correct = model_response.lower().strip() == correct_answer.lower().strip()
     return is_correct
 
-def process_single_evaluation(question, prompt_template, model_name):
+def process_single_evaluation(question, prompt_template, model_name, clients):
     answer, response_text = get_model_response(
         question['question'],
         question['options'],
         prompt_template,
-        model_name
+        model_name,
+        clients
     )
     is_correct = evaluate_response(answer, question['correct_answer'])
     return {
@@ -210,7 +209,7 @@ def process_single_evaluation(question, prompt_template, model_name):
         'model_name': model_name
     }
 
-def process_evaluations_concurrently(questions, prompt_template, models_to_evaluate, progress_callback):
+def process_evaluations_concurrently(questions, prompt_template, models_to_evaluate, progress_callback, clients):
     results = []
     total_iterations = len(models_to_evaluate) * len(questions)
     current_iteration = 0
@@ -219,7 +218,7 @@ def process_evaluations_concurrently(questions, prompt_template, models_to_evalu
         future_to_params = {}
         for model_name in models_to_evaluate:
             for question in questions:
-                future = executor.submit(process_single_evaluation, question, prompt_template, model_name)
+                future = executor.submit(process_single_evaluation, question, prompt_template, model_name, clients)
                 future_to_params[future] = (model_name, question)
 
         for future in as_completed(future_to_params):
@@ -318,6 +317,13 @@ Important:
     with st.spinner("Starting evaluation..."):
         selected_questions = questions[:num_questions]
 
+        # Create a clients dictionary
+        clients = {
+            "togetherai": st.session_state["togetherai_client"],
+            "openai": st.session_state["openai_client"],
+            "anthropic": st.session_state["anthropic_client"]
+        }
+
         progress_container = st.container()
         progress_bar = progress_container.progress(0)
         status_text = progress_container.empty()
@@ -326,12 +332,13 @@ Important:
             progress = current / total
             progress_bar.progress(progress)
             status_text.text(f"Progress: {current}/{total} evaluations completed")
-
+
         results = process_evaluations_concurrently(
             selected_questions,
             prompt_template,
             models_to_evaluate,
-            update_progress
+            update_progress,
+            clients
         )
 
         all_results = {}
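The substantive change in this commit is that get_model_response, process_single_evaluation, and process_evaluations_concurrently now receive a clients dictionary explicitly instead of relying on module-level client objects, and the Streamlit page builds that dictionary from st.session_state before starting the ThreadPoolExecutor run. The diff assumes the session already holds "togetherai_client", "openai_client", and "anthropic_client" entries; their creation is not shown in these hunks. A minimal sketch of that setup, assuming the official together, openai, and anthropic Python SDKs and illustrative st.secrets key names (not taken from the commit):

import streamlit as st
from together import Together
from openai import OpenAI
from anthropic import Anthropic

def init_clients():
    # Build each provider client once per session and cache it under the
    # session_state keys that the evaluation code reads. The secret names
    # below are placeholders, not taken from the commit.
    if "togetherai_client" not in st.session_state:
        st.session_state["togetherai_client"] = Together(api_key=st.secrets["TOGETHER_API_KEY"])
    if "openai_client" not in st.session_state:
        st.session_state["openai_client"] = OpenAI(api_key=st.secrets["OPENAI_API_KEY"])
    if "anthropic_client" not in st.session_state:
        st.session_state["anthropic_client"] = Anthropic(api_key=st.secrets["ANTHROPIC_API_KEY"])

init_clients()

Resolving the clients in the main script run and handing plain objects to executor.submit is the safer pattern here: st.session_state is generally not accessible from worker threads that Streamlit did not start, while the SDK client objects can typically be shared across threads, and request throttling is already handled by the existing semaphore.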