Update app.py
app.py
CHANGED
```diff
@@ -12,13 +12,17 @@ import sys
 import io
 from contextlib import redirect_stdout, redirect_stderr
 
-# Initialize session state variables
+# Initialize session state variables properly at the very beginning
 if 'main_output' not in st.session_state:
-    st.session_state
+    st.session_state['main_output'] = []
 if 'debug_output' not in st.session_state:
-    st.session_state
+    st.session_state['debug_output'] = []
 if 'progress' not in st.session_state:
-    st.session_state
+    st.session_state['progress'] = 0
+if 'results_df' not in st.session_state:
+    st.session_state['results_df'] = pd.DataFrame()
+if 'is_running' not in st.session_state:
+    st.session_state['is_running'] = False
 
 # FILES
 iteration_output_file = "llm_benchmark_iteration_results.csv" # File to store iteration results, defined as global
```
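The guarded lookups above exist because Streamlit re-executes the whole script on every widget interaction; only `st.session_state` survives reruns, and hoisting `results_df` and the new `is_running` flag to the top guarantees every key exists before any tab reads it. A minimal sketch of the same guarded-init pattern, with keys and defaults taken from the hunk and the loop itself a hypothetical refactor:

```python
import pandas as pd
import streamlit as st

# Hypothetical helper table: same semantics as the five guarded ifs above,
# keeping each key's default next to its name.
_DEFAULTS = {
    'main_output': [],
    'debug_output': [],
    'progress': 0,
    'results_df': pd.DataFrame(),
    'is_running': False,
}

for key, default in _DEFAULTS.items():
    if key not in st.session_state:  # only seed on the first run
        st.session_state[key] = default
```

Keeping defaults in one dict makes adding a sixth flag a one-line change.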
```diff
@@ -55,21 +59,15 @@ def custom_print(*args, **kwargs):
     output = ' '.join(map(str, args))
 
     # Add to main output list
-    st.session_state
+    st.session_state['main_output'].append(output)
 
     # Also print to standard output for console logging
     print(*args, **kwargs)
-
-    # Force an immediate update of the UI (when used inside a function)
-    st.session_state.update_counter = st.session_state.get('update_counter', 0) + 1
 
 # Custom function to capture warnings and errors
 def log_debug(message):
-    st.session_state
+    st.session_state['debug_output'].append(message)
     print(f"DEBUG: {message}", file=sys.stderr)
-
-    # Force an immediate update of the UI
-    st.session_state.update_counter = st.session_state.get('update_counter', 0) + 1
 
 def retry_api_request(max_retries=3, wait_time=10):
     """Decorator for retrying API requests with rate limit handling."""
```
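The body of `retry_api_request` sits outside the changed hunks, so only its signature and docstring are visible here. For orientation, a minimal sketch of what a decorator with this signature typically looks like; the broad `Exception` catch and the fixed back-off are assumptions, not the app's actual implementation:

```python
import functools
import time

def retry_api_request(max_retries=3, wait_time=10):
    """Decorator for retrying API requests with rate limit handling."""
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(1, max_retries + 1):
                try:
                    return func(*args, **kwargs)
                except Exception:  # a real version would catch specific rate-limit errors
                    if attempt == max_retries:
                        raise  # out of retries: surface the original error
                    time.sleep(wait_time)  # back off before the next attempt
        return wrapper
    return decorator
```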
```diff
@@ -511,6 +509,8 @@ def get_answer_from_model(model_id, question, consecutive_failures, failure_threshold
 
 # --- Core Logic ---
 def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
+    st.session_state['is_running'] = True
+
     results = {
         "model_name": [],
         "topic": [],
```
```diff
@@ -561,7 +561,7 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
 
     for iteration in range(t): # Added iteration counter
         # Update progress in the Streamlit app
-        st.session_state
+        st.session_state['progress'] = (iteration + 1) / t
 
         if len(active_models) < 2:
             custom_print("Fewer than 2 active models remaining. Exiting benchmark.")
```
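Storing `(iteration + 1) / t` keeps the value in (0, 1], which matches the float range `st.progress` accepts; the bar created in the UI section then picks the stored value up on the next rerun. A self-contained sketch of the direct variant, with `t` a hypothetical iteration count:

```python
import streamlit as st

t = 10  # hypothetical number of benchmark iterations
progress_bar = st.progress(0.0)  # st.progress accepts a float in [0.0, 1.0]

for iteration in range(t):
    # ... one benchmark iteration would run here ...
    progress_bar.progress((iteration + 1) / t)  # fraction of iterations done
```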
```diff
@@ -647,8 +647,11 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
             question_ranking_futures.append(future)
 
         for future in concurrent.futures.as_completed(question_ranking_futures): # Collect ranks as they become available
-
-
+            try:
+                ranking_model_id, rank = future.result() # Get model_id and rank
+                question_ranks[ranking_model_id] = rank # Store rank with model_id as key
+            except Exception as e:
+                log_debug(f"Error getting question rank result: {e}")
 
         question_ranking_end_time = time.time()
         question_ranking_duration_total = question_ranking_end_time - question_ranking_start_time
```
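The rewritten loop wraps each `future.result()` in its own `try`, so one failed ranking call is logged and skipped instead of aborting the whole collection loop. The same pattern in isolation; the function and task names here are illustrative, not taken from the app:

```python
import concurrent.futures

def collect_ranks(tasks):
    """Fan out tasks, then gather (model_id, rank) pairs as they complete.

    Each task is assumed to be a (callable, *args) tuple whose callable
    returns a (model_id, rank) pair. Failures are logged, not raised.
    """
    ranks = {}
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(fn, *args) for fn, *args in tasks]
        for future in concurrent.futures.as_completed(futures):
            try:
                model_id, rank = future.result()
                ranks[model_id] = rank
            except Exception as e:  # one bad future must not kill the loop
                print(f"DEBUG: rank collection failed: {e}")
    return ranks
```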
```diff
@@ -674,11 +677,11 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
 
         if question_avg_rank >= question_treshold and all(rank > reject_rank for rank in valid_question_ranks_values): # Question acceptance criteria
             question_accepted = True
-            custom_print(f"Question accepted. Avg Question Rank: {question_avg_rank:.2f}, Min Rank: {min_question_rank}, Ranks: {[question_ranks
+            custom_print(f"Question accepted. Avg Question Rank: {question_avg_rank:.2f}, Min Rank: {min_question_rank}, Ranks: {[question_ranks.get(m, None) for m in active_models if m in question_ranks]}")
             s_t += 1
         else:
             question_accepted = False
-            custom_print(f"Question rejected. Avg Question Rank: {question_avg_rank:.2f}, Min Rank: {min_question_rank}, Ranks: {[question_ranks
+            custom_print(f"Question rejected. Avg Question Rank: {question_avg_rank:.2f}, Min Rank: {min_question_rank}, Ranks: {[question_ranks.get(m, None) for m in active_models if m in question_ranks]}")
 
         if not question_accepted:
             custom_print("Generated question was not accepted. Regenerating question.")
```
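A question is kept only when its average rank clears `question_treshold` (spelling as in the code) and no single rank falls at or below `reject_rank`. A toy check of that rule, with the threshold values assumed for illustration:

```python
question_treshold = 3.5  # assumed value; identifier spelling follows the app
reject_rank = 2          # assumed value
valid_question_ranks_values = [4, 5, 3]

question_avg_rank = sum(valid_question_ranks_values) / len(valid_question_ranks_values)
accepted = (question_avg_rank >= question_treshold
            and all(rank > reject_rank for rank in valid_question_ranks_values))
print(question_avg_rank, accepted)  # 4.0 True: average clears 3.5 and no rank <= 2
```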
```diff
@@ -709,35 +712,43 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
                     token,
                     timeout=60
                 )
-                answer_futures.append(future)
-            except
-                log_debug(f"
-                answer = "
-                duration =
-                answers[model_id] = answer
-                answer_durations[model_id] = duration
-
-        for future in
-
-
-
-
+                answer_futures.append((model_id, future))
+            except Exception as e:
+                log_debug(f"Error submitting answer task for {model_id}: {e}")
+                answer = "Error answering - Task submission failed"
+                duration = 0
+                answers[model_id] = answer
+                answer_durations[model_id] = duration
+
+        for model_id, future in answer_futures:
+            try:
+                answer, duration = future.result() # Get both answer and duration
+                answers[model_id] = answer
+                answer_durations[model_id] = duration
+            except Exception as e:
+                log_debug(f"Error getting answer from {model_id}: {e}")
+                answers[model_id] = "Error answering - Future result failed"
+                answer_durations[model_id] = 0
 
         # --- Ranking Process ---
 
         # Prepare to write to file (open in append mode outside the model loop but inside iteration loop)
-
-
-
+        try:
+            iteration_results_file_opened = open(iteration_output_file, 'a')
+            if iteration == 0: # Write header only for the first iteration
+                iteration_results_file_opened.write("Iteration, Topic, Difficulty, Question Rank, QR Duration, Model,Cumulative Avg Rank,Iteration Avg Rank,Ranks,Ranking Duration (sec)\n") # Added Ranking Duration to header
+        except Exception as e:
+            log_debug(f"Error opening results file: {e}")
+            iteration_results_file_opened = None
 
 
         for model_id in active_models:
-
-            if not answer: # Add guard clause
+            if model_id not in answers:
                 log_debug(f"No answer found for model {model_id}. Skipping ranking.")
                 continue
 
-
+            answer = answers[model_id]
+            if answer == "Error answering" or answer.startswith("Error answering -"): # Handle answer generation errors
                 consecutive_failures[model_id] += 1
                 if consecutive_failures[model_id] >= failure_threshold:
                     custom_print(f"Model {model_id} is consistently failing to answer. Removing from active models.")
```
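Answer futures are now stored as `(model_id, future)` pairs, so each result stays attributed to the right model even when `future.result()` raises. A common equivalent is a future-to-model dict combined with `as_completed`, sketched below; `get_answer` and its `(answer, duration)` return shape are assumptions standing in for the app's helper:

```python
import concurrent.futures

def gather_answers(executor, get_answer, model_ids, question):
    """Map each future back to its model via a dict, collecting out of order.

    Assumes get_answer(model_id, question) -> (answer, duration).
    """
    future_to_model = {
        executor.submit(get_answer, m, question): m for m in model_ids
    }
    answers, durations = {}, {}
    for future in concurrent.futures.as_completed(future_to_model):
        model_id = future_to_model[future]  # attribution survives reordering
        try:
            answers[model_id], durations[model_id] = future.result()
        except Exception:
            answers[model_id] = "Error answering - Future result failed"
            durations[model_id] = 0
    return answers, durations
```

The dict variant returns results as soon as each finishes; the committed tuple-list variant instead waits on futures in submission order, which is simpler and fine when every answer is needed before ranking starts anyway.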
```diff
@@ -759,24 +770,30 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
         for ranking_model_id in active_models:
             # --- Filter for ranking roles ("rank" or "both") ---
             if model_config[ranking_model_id].get("role", "both") in ["rank", "both"]:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                try:
+                    future = executor.submit(
+                        get_rank_from_model,
+                        ranking_model_id,
+                        question,
+                        answer,
+                        consecutive_failures,
+                        failure_threshold,
+                        unresponsive_models,
+                        model_config,
+                        topic,
+                        token,
+                        timeout=60
+                    )
+                    ranking_futures.append(future)
+                except Exception as e:
+                    log_debug(f"Error submitting ranking task for {ranking_model_id}: {e}")
 
         for future in concurrent.futures.as_completed(ranking_futures): # Collect ranks as they become available
-
-
+            try:
+                ranking_model_id, rank = future.result() # Get model_id and rank
+                ranks[ranking_model_id] = rank # Store rank with model_id as key
+            except Exception as e:
+                log_debug(f"Error getting rank result: {e}")
 
         ranking_end_time = time.time() # Record end time of ranking
         ranking_duration = ranking_end_time - ranking_start_time # Calculate duration
```
```diff
@@ -807,20 +824,29 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
             results["answer"].append(answer)
             results["answer_generation_duration"].append(answer_durations.get(model_id, 0))
             results["average_rank"].append(average_rank)
-            results["ranks"].append([ranks
+            results["ranks"].append([ranks.get(m, None) for m in active_models if m in ranks]) # Store raw ranks including Nones, ensure order
             results["question_rank_average"].append(question_avg_rank) # Store question rank average
-            results["question_ranks"].append([question_ranks
+            results["question_ranks"].append([question_ranks.get(m, None) for m in active_models if m in question_ranks]) # Store question ranks
             results["question_rank_duration"].append(question_ranking_duration_total) # Store question ranking duration
 
-
-
+            if model_id in cumulative_model_ranks:
+                cumulative_model_ranks[model_id].append(average_rank) # Append current iteration's average rank
+
+            if model_id in cumulative_model_ranks and cumulative_model_ranks[model_id]:
+                cumulative_avg_rank[model_id] = np.nanmean([r for r in cumulative_model_ranks[model_id] if not np.isnan(r)])
+            else:
+                cumulative_avg_rank[model_id] = np.nan
 
             # --- Print and store iteration results IMMEDIATELY after ranking for this model ---
-            ranks_str = "[" + ", ".join(map(str, [ranks
-            custom_print(f"{topic}, {difficulty_mapping[difficulty]}, {model_id}, {cumulative_avg_rank
+            ranks_str = "[" + ", ".join(map(str, [ranks.get(m, None) for m in active_models if m in ranks])) + "]" if ranks else "[]" # Format ranks for CSV, ensure order
+            custom_print(f"{topic}, {difficulty_mapping[difficulty]}, {model_id}, {cumulative_avg_rank.get(model_id, np.nan):.2f}, {average_rank:.5f}, {ranks_str}, {ranking_duration:.2f} sec")
 
             # Write iteration results to file (append mode) - write for each model right after ranking
-            iteration_results_file_opened
+            if iteration_results_file_opened:
+                try:
+                    iteration_results_file_opened.write(f"{iteration+1},{topic}, {difficulty_mapping[difficulty]},{question_avg_rank:.2f},{question_ranking_duration_total:.2f},{model_id},{cumulative_avg_rank.get(model_id, np.nan):.2f},{average_rank:.2f},{ranks_str},{ranking_duration:.2f}\n")
+                except Exception as e:
+                    log_debug(f"Error writing to results file: {e}")
 
         # Update model weights based on cumulative average ranks, handling NaNs
         temp_weights = {}
```
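`np.nanmean` already ignores NaN entries, so the comprehension that filters them out first is belt-and-braces rather than required. A tiny worked example with made-up rank history:

```python
import numpy as np

cumulative_model_ranks = {'model-a': [1.0, np.nan, 2.0]}  # illustrative data

ranks = cumulative_model_ranks['model-a']
# Both forms skip the NaN and average the remaining two values.
print(np.nanmean(ranks))                                  # 1.5
print(np.nanmean([r for r in ranks if not np.isnan(r)]))  # 1.5
```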
```diff
@@ -842,9 +868,11 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
         for m_id in active_models:
             model_weights[m_id] = 1.0 / len(active_models)
 
-        iteration_results_file_opened
+        if iteration_results_file_opened:
+            iteration_results_file_opened.close()
 
     custom_print(f"Unresponsive models during this run: {unresponsive_models}")
+    st.session_state['is_running'] = False
     return results, cumulative_avg_rank, s_t
 
 def check_model_availability(models, token):
```
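Guarding the `close()` behind `if iteration_results_file_opened:` matches the `None` fallback set when `open` fails, but any exception between `open` and `close` can still leak the handle. A hedged alternative sketch that reopens per row in append mode and lets `with` guarantee the close; the path follows the diff, the helper name is hypothetical:

```python
def append_result_row(row: str,
                      path: str = "llm_benchmark_iteration_results.csv") -> None:
    """Append one CSV row; the with-block closes the file even on error."""
    with open(path, "a") as f:
        f.write(row + "\n")
```

Per-row reopening costs a little I/O but removes the long-lived handle threaded through the iteration loop.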
```diff
@@ -970,14 +998,14 @@ with tab1:
         st.error("You need at least 2 available models to run the benchmark")
 
     # Progress bar
-    progress_bar = st.progress(st.session_state
+    progress_bar = st.progress(st.session_state['progress'])
 
     # Start benchmark button
     if st.sidebar.button("Start Benchmark"):
         # Clear previous outputs
-        st.session_state
-        st.session_state
-        st.session_state
+        st.session_state['main_output'] = []
+        st.session_state['debug_output'] = []
+        st.session_state['progress'] = 0
 
         if not hf_token:
             st.error("Please enter your Hugging Face API token")
```
```diff
@@ -989,10 +1017,6 @@ with tab1:
         # Setup to capture results for display
         results_container = st.container()
 
-        # Create a global variable to store intermediate results
-        if 'results_df' not in st.session_state:
-            st.session_state.results_df = pd.DataFrame()
-
         # Run the benchmark
         try:
             # Run benchmark and get results
```
```diff
@@ -1003,19 +1027,19 @@ with tab1:
             )
 
             # Update progress to complete
-            st.session_state
+            st.session_state['progress'] = 1.0
             progress_bar.progress(1.0)
 
             # Display results
             if total_successful > 0:
                 results_df = pd.DataFrame(results)
-                st.session_state
+                st.session_state['results_df'] = results_df
 
                 # Show model rankings
                 st.subheader("Model Rankings")
                 ranking_df = pd.DataFrame({
                     "Model": list(cumulative_avg_rank.keys()),
-                    "Average Rank": [round(r, 2) for r in cumulative_avg_rank.values()]
+                    "Average Rank": [round(r, 2) if not np.isnan(r) else 'N/A' for r in cumulative_avg_rank.values()]
                 })
                 ranking_df = ranking_df.sort_values("Average Rank", ascending=False)
                 st.dataframe(ranking_df)
```
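One caveat with the new column: formatting NaN as the string 'N/A' before sorting leaves "Average Rank" with mixed float/str values, and `sort_values` on such a mixed column can raise a TypeError. A sketch that sorts on the numeric column first and formats only for display; the data here is made up:

```python
import numpy as np
import pandas as pd

cumulative_avg_rank = {'model-a': 3.2, 'model-b': np.nan}  # illustrative

# Sort while the column is still numeric (NaNs land last by default),
# then render NaN as 'N/A' purely for display.
ranking_df = pd.DataFrame({
    "Model": list(cumulative_avg_rank.keys()),
    "Average Rank": list(cumulative_avg_rank.values()),
}).sort_values("Average Rank", ascending=False)
ranking_df["Average Rank"] = ranking_df["Average Rank"].round(2).fillna("N/A")
print(ranking_df)
```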
```diff
@@ -1039,35 +1063,42 @@ with tab1:
             st.exception(e)
 
     # Show previous results if available
-    elif 'results_df' in st.session_state and not st.session_state
+    elif 'results_df' in st.session_state and not st.session_state['results_df'].empty:
         st.subheader("Previous Results")
-        st.dataframe(st.session_state
+        st.dataframe(st.session_state['results_df'])
 
 with tab2:
     # Display main output log
     st.subheader("Execution Log")
 
     # Display logs
-
-
+    if 'main_output' in st.session_state:
+        log_text = "\n".join(st.session_state['main_output'])
+        st.text_area("Progress Log", log_text, height=400)
+    else:
+        st.text_area("Progress Log", "No progress logs yet.", height=400)
 
     # Add a refresh button for the log
     if st.button("Refresh Progress Log"):
-
+        st.experimental_rerun()
 
 with tab3:
     # Display debug output
     st.subheader("Debug Log")
 
     # Display debug logs
-
-
+    if 'debug_output' in st.session_state:
+        debug_text = "\n".join(st.session_state['debug_output'])
+        st.text_area("Debug Information", debug_text, height=400)
+    else:
+        st.text_area("Debug Information", "No debug logs yet.", height=400)
 
     # Add a refresh button for the debug log
     if st.button("Refresh Debug Log"):
-
-
-# Auto-
-if st.session_state.get('
-
-
+        st.experimental_rerun()
+
+# Auto-update while benchmark is running
+if st.session_state.get('is_running', False):
+    st.empty()
+    time.sleep(5) # Update every 5 seconds while running
+    st.rerun()
```
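The new tail polls: while `is_running` is set, the script sleeps five seconds and forces a fresh top-to-bottom run. As committed, though, `run_benchmark` executes synchronously inside the button handler and clears the flag before this block is reached, so the polling mainly pays off if the benchmark is later moved to a background thread. Note also the mixed APIs: the refresh buttons call `st.experimental_rerun()` while the tail calls `st.rerun()`, its renamed replacement in recent Streamlit releases. A minimal sketch of the tail under the background-worker assumption:

```python
import time
import streamlit as st

# Assumes a background worker sets st.session_state['is_running'] to False
# when the benchmark finishes; this block just keeps the page refreshing.
if st.session_state.get('is_running', False):
    time.sleep(5)        # wait, then force a fresh script run
    st.rerun()           # replaces the deprecated st.experimental_rerun()
```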
|