Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -64,7 +64,7 @@ def retry_api_request(max_retries=3, wait_time=10):
|
|
| 64 |
|
| 65 |
# --- Single model request function for Hugging Face ---
|
| 66 |
@retry_api_request()
|
| 67 |
-
def make_hf_request(model_name, messages, temperature, max_tokens):
|
| 68 |
"""
|
| 69 |
Send request to a Hugging Face model using InferenceClient
|
| 70 |
|
|
@@ -73,11 +73,12 @@ def make_hf_request(model_name, messages, temperature, max_tokens):
|
|
| 73 |
messages: Messages in the format [{"role": "user", "content": "..."}]
|
| 74 |
temperature: Temperature parameter for generation
|
| 75 |
max_tokens: Maximum tokens to generate
|
|
|
|
| 76 |
|
| 77 |
Returns:
|
| 78 |
Generated text or None if request fails
|
| 79 |
"""
|
| 80 |
-
client = InferenceClient(model=model_name)
|
| 81 |
|
| 82 |
# Convert messages to a prompt string
|
| 83 |
prompt = ""
|
|
@@ -304,7 +305,6 @@ def rank_answer_prompt(question, answer, topic):
|
|
| 304 |
3: Good answer - clear, relevant to the topic, well-formulated, with correct statements. For creative writing, this includes demonstrating good originality, imagination, and adherence to the prompt, including the 3000-character limit.
|
| 305 |
4: Very good answer - very clear, very relevant to the topic, expertly formulated, with highly correct statements. For creative writing, shows strong originality, a compelling narrative or poetic voice, and excellent adherence to the prompt, including the 3000-character limit.
|
| 306 |
5: Exceptionally good answer - only appliable to exceptional answers that match all the criteria of the previous "4: Very good answer", but also bring additional unique insights, perfectly sound original arguments, or other exceptional unexpected contributions to the topic. For creative writing, this indicates a truly outstanding piece of writing with exceptional creativity, emotional resonance, and masterful execution, while adhering to the 3000-character limit.
|
| 307 |
-
|
| 308 |
Consider these criteria in your ranking:
|
| 309 |
- Clarity: Is the answer easy to understand? Is it ambiguous or confusing?
|
| 310 |
- Relevance: Is the answer relevant to the specified topic?
|
|
@@ -323,7 +323,6 @@ Consider these criteria in your ranking:
|
|
| 323 |
|
| 324 |
prompt += f"""
|
| 325 |
Just return a single number (the rank from 1 to 5), do not add any other text.
|
| 326 |
-
|
| 327 |
Question: {question}
|
| 328 |
Answer: {answer}
|
| 329 |
Rank:"""
|
|
@@ -345,7 +344,6 @@ def rank_question_prompt(question, topic, difficulty):
|
|
| 345 |
3: Good question - clear, relevant to the topic, generally appropriate for the difficulty level, and reasonably well-formulated. For creative writing, the prompt is clear, provides a reasonable starting point for creative work, and sets a clear 3000-character limit.
|
| 346 |
4: Very good question - clear, highly relevant to the topic, appropriate for the difficulty level, and well-formulated. For creative writing, the prompt is engaging, sparks imagination, and offers a good balance of direction and freedom, with a clear 3000-character limit.
|
| 347 |
5: Excellent question - exceptionally clear, insightful, highly relevant to the topic, perfectly matched to the difficulty level, and expertly formulated. For creative writing, the prompt is exceptionally creative, thought-provoking, and likely to inspire high-quality writing, with a clear 3000-character limit.
|
| 348 |
-
|
| 349 |
Consider these criteria in your ranking:
|
| 350 |
- Clarity: Is the question easy to understand? Is it ambiguous or confusing?
|
| 351 |
- Relevance: Is the question relevant to the specified topic ({topic})?
|
|
@@ -361,7 +359,6 @@ Consider these criteria in your ranking:
|
|
| 361 |
"""
|
| 362 |
prompt += f"""
|
| 363 |
Just return a single number (the rank from 1 to 5), do not add any other text.
|
| 364 |
-
|
| 365 |
Question: {question}
|
| 366 |
Rank:"""
|
| 367 |
return prompt
|
|
@@ -385,14 +382,14 @@ def parse_rank_string(rank_str, ranking_model_id):
|
|
| 385 |
return None
|
| 386 |
|
| 387 |
# --- Helper Function for Parallel Ranking ---
|
| 388 |
-
def get_rank_from_model(ranking_model_id, question, answer, consecutive_failures, failure_threshold, unresponsive_models, model_config, topic, timeout=60):
|
| 389 |
start_time = time.time()
|
| 390 |
rank = None # Initialize rank to None, indicating potential failure
|
| 391 |
|
| 392 |
rank_prompt = rank_answer_prompt(question, answer, topic)
|
| 393 |
|
| 394 |
try:
|
| 395 |
-
response = make_hf_request(model_config[ranking_model_id]["name"], [{"role": "user", "content": rank_prompt}], base_temp, 5)
|
| 396 |
if response:
|
| 397 |
try:
|
| 398 |
rank_str = response.strip()
|
|
@@ -416,14 +413,14 @@ def get_rank_from_model(ranking_model_id, question, answer, consecutive_failures
|
|
| 416 |
return ranking_model_id, rank
|
| 417 |
|
| 418 |
# --- Helper Function for Parallel Ranking of questions ---
|
| 419 |
-
def get_question_rank_from_model(ranking_model_id, question, topic, difficulty, consecutive_failures, failure_threshold, unresponsive_models, model_config, timeout=60):
|
| 420 |
start_time = time.time()
|
| 421 |
rank = None # Initialize rank to None, indicating potential failure
|
| 422 |
|
| 423 |
rank_prompt = rank_question_prompt(question, topic, difficulty) # Use question rank prompt
|
| 424 |
|
| 425 |
try:
|
| 426 |
-
response = make_hf_request(model_config[ranking_model_id]["name"], [{"role": "user", "content": rank_prompt}], base_temp, 5)
|
| 427 |
if response:
|
| 428 |
try:
|
| 429 |
rank_str = response.strip()
|
|
@@ -447,7 +444,7 @@ def get_question_rank_from_model(ranking_model_id, question, topic, difficulty,
|
|
| 447 |
return ranking_model_id, rank
|
| 448 |
|
| 449 |
# --- Helper Function for Parallel Answering ---
|
| 450 |
-
def get_answer_from_model(model_id, question, consecutive_failures, failure_threshold, unresponsive_models, model_config, topic, timeout=60):
|
| 451 |
start_time = time.time() # Start timer
|
| 452 |
answer_prompt = answer_question_prompt(question)
|
| 453 |
answer = "Error answering" # Default answer
|
|
@@ -459,7 +456,7 @@ def get_answer_from_model(model_id, question, consecutive_failures, failure_thre
|
|
| 459 |
max_tok = long_max_tokens
|
| 460 |
|
| 461 |
try:
|
| 462 |
-
response = make_hf_request(model_config[model_id]["name"], [{"role": "user", "content": answer_prompt}], temp, max_tok)
|
| 463 |
if response:
|
| 464 |
answer = response.strip()
|
| 465 |
except Exception as e:
|
|
@@ -475,7 +472,7 @@ def get_answer_from_model(model_id, question, consecutive_failures, failure_thre
|
|
| 475 |
return answer, duration # Return answer and duration
|
| 476 |
|
| 477 |
# --- Core Logic ---
|
| 478 |
-
def run_benchmark(hf_models, topics, difficulties, t, model_config):
|
| 479 |
results = {
|
| 480 |
"model_name": [],
|
| 481 |
"topic": [],
|
|
@@ -563,7 +560,8 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config):
|
|
| 563 |
response = make_hf_request(model_config[question_generator_model_id]["name"],
|
| 564 |
[{"role": "user", "content": question_prompt}],
|
| 565 |
question_temp,
|
| 566 |
-
question_max_tokens
|
|
|
|
| 567 |
|
| 568 |
if response:
|
| 569 |
question = response.strip()
|
|
@@ -603,6 +601,7 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config):
|
|
| 603 |
failure_threshold,
|
| 604 |
unresponsive_models,
|
| 605 |
model_config,
|
|
|
|
| 606 |
timeout=60
|
| 607 |
)
|
| 608 |
question_ranking_futures.append(future)
|
|
@@ -667,6 +666,7 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config):
|
|
| 667 |
unresponsive_models,
|
| 668 |
model_config,
|
| 669 |
topic,
|
|
|
|
| 670 |
timeout=60
|
| 671 |
)
|
| 672 |
answer_futures.append(future)
|
|
@@ -726,6 +726,7 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config):
|
|
| 726 |
unresponsive_models,
|
| 727 |
model_config,
|
| 728 |
topic,
|
|
|
|
| 729 |
timeout=60
|
| 730 |
)
|
| 731 |
ranking_futures.append(future)
|
|
@@ -870,12 +871,6 @@ if st.sidebar.button("Start Benchmark"):
|
|
| 870 |
if 'results_df' not in st.session_state:
|
| 871 |
st.session_state.results_df = pd.DataFrame()
|
| 872 |
|
| 873 |
-
# Modify make_hf_request to use the token
|
| 874 |
-
def make_hf_request_with_token(model_name, messages, temperature, max_tokens):
|
| 875 |
-
client = InferenceClient(model=model_name, token=hf_token)
|
| 876 |
-
# Rest of the function is the same...
|
| 877 |
-
# Return response
|
| 878 |
-
|
| 879 |
# Run the benchmark
|
| 880 |
try:
|
| 881 |
# Update status
|
|
@@ -885,7 +880,7 @@ if st.sidebar.button("Start Benchmark"):
|
|
| 885 |
results, cumulative_avg_rank, total_successful = run_benchmark(
|
| 886 |
selected_models, selected_topics,
|
| 887 |
["a very simple", "a simple", "a", "a difficult", "a very difficult"],
|
| 888 |
-
num_iterations, model_config
|
| 889 |
)
|
| 890 |
|
| 891 |
# Update progress to complete
|
|
|
|
| 64 |
|
| 65 |
# --- Single model request function for Hugging Face ---
|
| 66 |
@retry_api_request()
|
| 67 |
+
def make_hf_request(model_name, messages, temperature, max_tokens, token=None):
|
| 68 |
"""
|
| 69 |
Send request to a Hugging Face model using InferenceClient
|
| 70 |
|
|
|
|
| 73 |
messages: Messages in the format [{"role": "user", "content": "..."}]
|
| 74 |
temperature: Temperature parameter for generation
|
| 75 |
max_tokens: Maximum tokens to generate
|
| 76 |
+
token: Hugging Face API token
|
| 77 |
|
| 78 |
Returns:
|
| 79 |
Generated text or None if request fails
|
| 80 |
"""
|
| 81 |
+
client = InferenceClient(model=model_name, token=token)
|
| 82 |
|
| 83 |
# Convert messages to a prompt string
|
| 84 |
prompt = ""
|
|
|
|
| 305 |
3: Good answer - clear, relevant to the topic, well-formulated, with correct statements. For creative writing, this includes demonstrating good originality, imagination, and adherence to the prompt, including the 3000-character limit.
|
| 306 |
4: Very good answer - very clear, very relevant to the topic, expertly formulated, with highly correct statements. For creative writing, shows strong originality, a compelling narrative or poetic voice, and excellent adherence to the prompt, including the 3000-character limit.
|
| 307 |
5: Exceptionally good answer - only appliable to exceptional answers that match all the criteria of the previous "4: Very good answer", but also bring additional unique insights, perfectly sound original arguments, or other exceptional unexpected contributions to the topic. For creative writing, this indicates a truly outstanding piece of writing with exceptional creativity, emotional resonance, and masterful execution, while adhering to the 3000-character limit.
|
|
|
|
| 308 |
Consider these criteria in your ranking:
|
| 309 |
- Clarity: Is the answer easy to understand? Is it ambiguous or confusing?
|
| 310 |
- Relevance: Is the answer relevant to the specified topic?
|
|
|
|
| 323 |
|
| 324 |
prompt += f"""
|
| 325 |
Just return a single number (the rank from 1 to 5), do not add any other text.
|
|
|
|
| 326 |
Question: {question}
|
| 327 |
Answer: {answer}
|
| 328 |
Rank:"""
|
|
|
|
| 344 |
3: Good question - clear, relevant to the topic, generally appropriate for the difficulty level, and reasonably well-formulated. For creative writing, the prompt is clear, provides a reasonable starting point for creative work, and sets a clear 3000-character limit.
|
| 345 |
4: Very good question - clear, highly relevant to the topic, appropriate for the difficulty level, and well-formulated. For creative writing, the prompt is engaging, sparks imagination, and offers a good balance of direction and freedom, with a clear 3000-character limit.
|
| 346 |
5: Excellent question - exceptionally clear, insightful, highly relevant to the topic, perfectly matched to the difficulty level, and expertly formulated. For creative writing, the prompt is exceptionally creative, thought-provoking, and likely to inspire high-quality writing, with a clear 3000-character limit.
|
|
|
|
| 347 |
Consider these criteria in your ranking:
|
| 348 |
- Clarity: Is the question easy to understand? Is it ambiguous or confusing?
|
| 349 |
- Relevance: Is the question relevant to the specified topic ({topic})?
|
|
|
|
| 359 |
"""
|
| 360 |
prompt += f"""
|
| 361 |
Just return a single number (the rank from 1 to 5), do not add any other text.
|
|
|
|
| 362 |
Question: {question}
|
| 363 |
Rank:"""
|
| 364 |
return prompt
|
|
|
|
| 382 |
return None
|
| 383 |
|
| 384 |
# --- Helper Function for Parallel Ranking ---
|
| 385 |
+
def get_rank_from_model(ranking_model_id, question, answer, consecutive_failures, failure_threshold, unresponsive_models, model_config, topic, token=None, timeout=60):
|
| 386 |
start_time = time.time()
|
| 387 |
rank = None # Initialize rank to None, indicating potential failure
|
| 388 |
|
| 389 |
rank_prompt = rank_answer_prompt(question, answer, topic)
|
| 390 |
|
| 391 |
try:
|
| 392 |
+
response = make_hf_request(model_config[ranking_model_id]["name"], [{"role": "user", "content": rank_prompt}], base_temp, 5, token=token)
|
| 393 |
if response:
|
| 394 |
try:
|
| 395 |
rank_str = response.strip()
|
|
|
|
| 413 |
return ranking_model_id, rank
|
| 414 |
|
| 415 |
# --- Helper Function for Parallel Ranking of questions ---
|
| 416 |
+
def get_question_rank_from_model(ranking_model_id, question, topic, difficulty, consecutive_failures, failure_threshold, unresponsive_models, model_config, token=None, timeout=60):
|
| 417 |
start_time = time.time()
|
| 418 |
rank = None # Initialize rank to None, indicating potential failure
|
| 419 |
|
| 420 |
rank_prompt = rank_question_prompt(question, topic, difficulty) # Use question rank prompt
|
| 421 |
|
| 422 |
try:
|
| 423 |
+
response = make_hf_request(model_config[ranking_model_id]["name"], [{"role": "user", "content": rank_prompt}], base_temp, 5, token=token)
|
| 424 |
if response:
|
| 425 |
try:
|
| 426 |
rank_str = response.strip()
|
|
|
|
| 444 |
return ranking_model_id, rank
|
| 445 |
|
| 446 |
# --- Helper Function for Parallel Answering ---
|
| 447 |
+
def get_answer_from_model(model_id, question, consecutive_failures, failure_threshold, unresponsive_models, model_config, topic, token=None, timeout=60):
|
| 448 |
start_time = time.time() # Start timer
|
| 449 |
answer_prompt = answer_question_prompt(question)
|
| 450 |
answer = "Error answering" # Default answer
|
|
|
|
| 456 |
max_tok = long_max_tokens
|
| 457 |
|
| 458 |
try:
|
| 459 |
+
response = make_hf_request(model_config[model_id]["name"], [{"role": "user", "content": answer_prompt}], temp, max_tok, token=token)
|
| 460 |
if response:
|
| 461 |
answer = response.strip()
|
| 462 |
except Exception as e:
|
|
|
|
| 472 |
return answer, duration # Return answer and duration
|
| 473 |
|
| 474 |
# --- Core Logic ---
|
| 475 |
+
def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
|
| 476 |
results = {
|
| 477 |
"model_name": [],
|
| 478 |
"topic": [],
|
|
|
|
| 560 |
response = make_hf_request(model_config[question_generator_model_id]["name"],
|
| 561 |
[{"role": "user", "content": question_prompt}],
|
| 562 |
question_temp,
|
| 563 |
+
question_max_tokens,
|
| 564 |
+
token=token)
|
| 565 |
|
| 566 |
if response:
|
| 567 |
question = response.strip()
|
|
|
|
| 601 |
failure_threshold,
|
| 602 |
unresponsive_models,
|
| 603 |
model_config,
|
| 604 |
+
token,
|
| 605 |
timeout=60
|
| 606 |
)
|
| 607 |
question_ranking_futures.append(future)
|
|
|
|
| 666 |
unresponsive_models,
|
| 667 |
model_config,
|
| 668 |
topic,
|
| 669 |
+
token,
|
| 670 |
timeout=60
|
| 671 |
)
|
| 672 |
answer_futures.append(future)
|
|
|
|
| 726 |
unresponsive_models,
|
| 727 |
model_config,
|
| 728 |
topic,
|
| 729 |
+
token,
|
| 730 |
timeout=60
|
| 731 |
)
|
| 732 |
ranking_futures.append(future)
|
|
|
|
| 871 |
if 'results_df' not in st.session_state:
|
| 872 |
st.session_state.results_df = pd.DataFrame()
|
| 873 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 874 |
# Run the benchmark
|
| 875 |
try:
|
| 876 |
# Update status
|
|
|
|
| 880 |
results, cumulative_avg_rank, total_successful = run_benchmark(
|
| 881 |
selected_models, selected_topics,
|
| 882 |
["a very simple", "a simple", "a", "a difficult", "a very difficult"],
|
| 883 |
+
num_iterations, model_config, hf_token
|
| 884 |
)
|
| 885 |
|
| 886 |
# Update progress to complete
|