Update app.py
app.py
CHANGED
@@ -8,52 +8,9 @@ import re
 import time
 import random
 import functools
-
-
-
-progress_area = st.empty() # For progress updates
-
-# Initialize session state variables
-if 'log_messages' not in st.session_state:
-    st.session_state.log_messages = []
-if 'results_df' not in st.session_state:
-    st.session_state.results_df = pd.DataFrame()
-
-# Collapsible section for logs
-with st.expander("Execution Log", expanded=False):
-    log_area = st.empty()
-
-def update_log():
-    """Update the log display with current messages"""
-    log_area.text_area("System Log", value="\n".join(st.session_state.log_messages), height=300)
-
-def log_message(message, level="INFO"):
-    """Log a message with timestamp and level"""
-    timestamp = time.strftime("%H:%M:%S")
-    formatted_msg = f"[{timestamp}] {level}: {message}"
-    st.session_state.log_messages.append(formatted_msg)
-    # Limit log size
-    if len(st.session_state.log_messages) > 500:
-        st.session_state.log_messages = st.session_state.log_messages[-500:]
-    update_log()
-
-# Specialized logging functions
-def log_info(message):
-    log_message(message, "INFO")
-
-def log_warning(message):
-    log_message(message, "WARNING")
-
-def log_error(message):
-    log_message(message, "ERROR")
-
-# Function to update status
-def update_status(message):
-    status_area.write(message)
-
-# Function to update progress message
-def update_progress(message):
-    progress_area.write(message)
+import sys
+import io
+from contextlib import redirect_stdout, redirect_stderr
 
 # FILES
 iteration_output_file = "llm_benchmark_iteration_results.csv" # File to store iteration results, defined as global
@@ -84,6 +41,28 @@ difficulty_probabilities = {
     "a very difficult": 0.6
 }
 
+# Create output displays for main log and debug log
+if 'main_output' not in st.session_state:
+    st.session_state.main_output = []
+if 'debug_output' not in st.session_state:
+    st.session_state.debug_output = []
+
+# Custom print function to capture output
+def custom_print(*args, **kwargs):
+    # Convert args to string and join with spaces
+    output = ' '.join(map(str, args))
+
+    # Add to main output list
+    st.session_state.main_output.append(output)
+
+    # Also print to standard output for console logging
+    print(*args, **kwargs)
+
+# Custom function to capture warnings and errors
+def log_debug(message):
+    st.session_state.debug_output.append(message)
+    print(f"DEBUG: {message}", file=sys.stderr)
+
 def retry_api_request(max_retries=3, wait_time=10):
     """Decorator for retrying API requests with rate limit handling."""
     def decorator(func):
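The new custom_print / log_debug helpers replace the deleted expander-based logger: messages are buffered in st.session_state and rendered later in the UI. A minimal standalone sketch of that capture-and-render pattern, purely for illustration (the buffer and function names below are assumptions mirroring the diff, not the committed code):

import streamlit as st

# A session-state list survives Streamlit reruns, so messages captured
# during a long run can be rendered later (e.g. in a log tab).
if 'main_output' not in st.session_state:
    st.session_state.main_output = []

def capture(msg: str) -> None:
    # Append to the buffer and also echo to the console.
    st.session_state.main_output.append(msg)
    print(msg)

capture("benchmark started")
st.text_area("Progress Log", "\n".join(st.session_state.main_output), height=200)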
@@ -94,13 +73,16 @@ def retry_api_request(max_retries=3, wait_time=10):
            try:
                return func(*args, **kwargs)
            except Exception as e:
+                error_msg = f"API error: {e}"
+                log_debug(error_msg)
                if retries < max_retries:
+                    retry_msg = f"Waiting for {wait_time} seconds before retrying... (Retry {retries + 1}/{max_retries})"
+                    log_debug(retry_msg)
                    time.sleep(wait_time)
                    retries += 1
                else:
+                    failure_msg = f"Max retries reached. Request failed."
+                    log_debug(failure_msg)
                    return None
 
        return None
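The hunk above only touches the middle of the decorator, so its surrounding structure is not visible in this diff. A self-contained sketch of how a retry decorator of this shape is commonly completed (the wrapper, the retry loop and functools.wraps are assumptions here, not the committed code):

import functools
import time

def retry_api_request(max_retries=3, wait_time=10):
    """Retry a flaky call a few times before giving up and returning None."""
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            retries = 0
            while retries <= max_retries:
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    print(f"API error: {e}")
                    if retries < max_retries:
                        time.sleep(wait_time)  # back off before the next attempt
                        retries += 1
                    else:
                        return None
            return None
        return wrapper
    return decorator

@retry_api_request(max_retries=2, wait_time=1)
def flaky_call():
    raise RuntimeError("simulated failure")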
@@ -147,7 +129,8 @@ def make_hf_request(model_name, messages, temperature, max_tokens, token=None):
        )
        return response
    except Exception as e:
+        error_msg = f"Hugging Face Inference API error: {e}"
+        log_debug(error_msg)
        return None
 
# --- Prompting Functions ---

@@ -332,7 +315,7 @@ def generate_question_prompt(topic, difficulty):
    if topic in topic_instructions:
        prompt += random.choice(topic_instructions[topic]) + "\n"
    else:
+        log_debug(f"Warning: No topic_instructions defined for topic '{topic}'")
 
    # 5. Conditional Question Types (Not for math, logics, grammar)
    if topic not in ["math", "logics", "grammar", "coding", "creative writing"]:

@@ -418,14 +401,14 @@ def parse_rank_string(rank_str, ranking_model_id):
        try:
            rank_val = int(rank_str) # Convert to integer *after* regex extraction
            if not 1 <= rank_val <= 5: # Check if rank is within valid range
+                log_debug(f"Warning: Model {ranking_model_id} returned rank outside of valid range [1-5]: {rank_val}. Rank set to None.")
                return None
            return rank_val
        except ValueError:
+            log_debug(f"Warning: Model {ranking_model_id} returned non-integer rank after regex extraction: '{rank_str}'. Rank set to None.")
            return None
    else:
+        log_debug(f"Warning: Model {ranking_model_id} returned non-numeric rank: '{rank_str}'. Rank set to None.")
        return None
 
# --- Helper Function for Parallel Ranking ---
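parse_rank_string converts the rank to an integer "after regex extraction", but the extraction step itself sits outside this diff. A plausible sketch of that step, purely for illustration (the pattern and helper name are assumptions):

import re

def extract_rank(raw: str):
    """Pull the first standalone digit 1-5 out of a model reply, else None."""
    match = re.search(r"\b([1-5])\b", raw)
    if match is None:
        return None
    rank = int(match.group(1))
    return rank if 1 <= rank <= 5 else None

print(extract_rank("I would rate this answer a 4 out of 5."))  # -> 4
print(extract_rank("no numeric rating given"))                 # -> None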
@@ -442,18 +425,18 @@ def get_rank_from_model(ranking_model_id, question, answer, consecutive_failures
            rank_str = response.strip()
            rank = parse_rank_string(rank_str, ranking_model_id)
        except ValueError:
+            log_debug(f"Warning: Model {ranking_model_id} returned non-integer rank: '{rank_str}'. Rank set to None.")
            rank = None
        else:
+            log_debug(f"Warning: Model {ranking_model_id} failed to provide rank. Rank set to None.")
    except Exception as e:
        duration = time.time() - start_time
+        log_debug(f"Warning: Model {ranking_model_id} ranking timed out or failed after {duration:.2f}s: {e}")
        rank = None
 
    duration = time.time() - start_time # Calculate total duration of ranking attempt
    if duration > timeout:
+        log_debug(f"Warning: Ranking by model {ranking_model_id} exceeded timeout of {timeout:.2f}s and took {duration:.2f}s.")
        rank = None # Ensure rank is None if timeout occurs
 
    time.sleep(time_sleep) # Keep a small delay to avoid overwhelming APIs even in parallel

@@ -473,18 +456,18 @@ def get_question_rank_from_model(ranking_model_id, question, topic, difficulty,
            rank_str = response.strip()
            rank = parse_rank_string(rank_str, ranking_model_id)
        except ValueError:
+            log_debug(f"Warning: Model {ranking_model_id} returned non-integer rank for question: '{rank_str}'. Rank set to None.")
            rank = None
        else:
+            log_debug(f"Warning: Model {ranking_model_id} failed to provide rank for question. Rank set to None.")
    except Exception as e:
        duration = time.time() - start_time
+        log_debug(f"Warning: Model {ranking_model_id} ranking question timed out or failed after {duration:.2f}s: {e}")
        rank = None
 
    duration = time.time() - start_time # Calculate total duration of ranking attempt
    if duration > timeout:
+        log_debug(f"Warning: Ranking question by model {ranking_model_id} exceeded timeout of {timeout:.2f}s and took {duration:.2f}s.")
        rank = None # Ensure rank is None if timeout occurs
 
    time.sleep(time_sleep) # Keep a small delay to avoid overwhelming APIs even in parallel

@@ -508,13 +491,13 @@ def get_answer_from_model(model_id, question, consecutive_failures, failure_thre
            answer = response.strip()
    except Exception as e:
        duration = time.time() - start_time
+        log_debug(f"Warning: Model {model_id} answering timed out or failed after {duration:.2f}s: {e}")
        answer = "Error answering - Timeout" # Or a specific timeout error message
        return answer, duration # Return error answer and duration
 
    time.sleep(time_sleep) # Small delay
    duration = time.time() - start_time # Calculate duration
+    custom_print(f"Answer generation by \"{model_id}\": {duration:.2f}s") # Print answer generation duration separately as requested - as requested
 
    return answer, duration # Return answer and duration
 
@@ -569,18 +552,17 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
    s_t = 0 #count succesful iterations
 
    for iteration in range(t): # Added iteration counter
-        # Update the
+        # Update progress in the Streamlit app
+        st.session_state.progress = (iteration + 1) / t
+
        if len(active_models) < 2:
+            custom_print("Fewer than 2 active models remaining. Exiting benchmark.")
            break
 
        topic = random.choice(topics)
        # --- Select difficulty with probabilities ---
        difficulty = random.choices(difficulty_choices, weights=probability_values, k=1)[0] # Weighted random choice
+        custom_print(f"--- Iteration {s_t + 1}/{t}: {difficulty} question ({difficulty_mapping[difficulty]}) on {topic} ---") # Print iteration number
 
        # --- Question Generation ---
        question = None

@@ -601,13 +583,12 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
            if model_config[model_id].get("role", "both") in ["answer", "both"]
        ]
        if not question_gen_candidates: # No suitable models left
+            custom_print("No models available for question generation with 'answer' or 'both' role. Skipping iteration.")
            continue # Skip to next iteration
 
        question_generator_model_id = random.choice(question_gen_candidates)
 
        # --- Question Generation ---
-        update_progress(f"Generating question using model {question_generator_model_id}...")
        response = make_hf_request(model_config[question_generator_model_id]["name"],
                                   [{"role": "user", "content": question_prompt}],
                                   question_temp,

@@ -619,26 +600,25 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
                consecutive_failures[question_generator_model_id] = 0 # Reset on success
                break
            else:
+                custom_print(f"Skipping due to request failure for model {question_generator_model_id}.")
                consecutive_failures[question_generator_model_id] += 1
 
                if consecutive_failures[question_generator_model_id] >= failure_threshold:
+                    custom_print(f"Model {question_generator_model_id} is unresponsive (question gen). Removing from active models.")
                    if question_generator_model_id in active_models:
                        active_models.remove(question_generator_model_id)
                        unresponsive_models.add(question_generator_model_id)
                time.sleep(time_sleep)
 
        if question is None:
+            custom_print(f"Failed to generate a question after {max_attempts} attempts. Skipping this round.")
            continue
 
        # --- Parallel Question Ranking ---
        question_ranks = {}
        question_ranking_futures = []
        question_ranking_start_time = time.time()
-        update_progress(f"Ranking generated question...")
+
        with concurrent.futures.ThreadPoolExecutor(max_workers=len(active_models) or 1) as executor:
            for ranking_model_id in active_models:
                # --- Filter for ranking roles ("rank" or "both") ---
@@ -677,34 +657,33 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
 
        #check that the length is correct
        if len(weights_for_valid_question_ranks) != len(valid_question_ranks_values):
+            log_debug("Warning: Mismatch length of weights and valid question ranks")
+            log_debug(f'weights_for_valid_question_ranks {weights_for_valid_question_ranks}')
+            log_debug(f'valid_question_ranks_values: {valid_question_ranks_values}')
 
        question_avg_rank = np.average(valid_question_ranks_values, weights=weights_for_valid_question_ranks)
        min_question_rank = min(valid_question_ranks_values) if valid_question_ranks_values else 0 # To avoid error if no valid rank
 
        if question_avg_rank >= question_treshold and all(rank > reject_rank for rank in valid_question_ranks_values): # Question acceptance criteria
            question_accepted = True
+            custom_print(f"Question accepted. Avg Question Rank: {question_avg_rank:.2f}, Min Rank: {min_question_rank}, Ranks: {[question_ranks[m] for m in active_models if m in question_ranks]}")
            s_t += 1
        else:
            question_accepted = False
+            custom_print(f"Question rejected. Avg Question Rank: {question_avg_rank:.2f}, Min Rank: {min_question_rank}, Ranks: {[question_ranks[m] for m in active_models if m in question_ranks]}")
 
        if not question_accepted:
+            custom_print("Generated question was not accepted. Regenerating question.")
            continue
 
        if len(active_models) < 2:
+            custom_print("Fewer than 2 active models remaining. Exiting benchmark.")
            break
 
        # --- Parallel Answer Generation ---
        answers = {}
        answer_futures = []
        answer_durations = {}
-        update_progress("Generating answers from all models...")
        with concurrent.futures.ThreadPoolExecutor(max_workers=len(active_models)) as executor:
            for model_id in active_models:
                # --- Filter for answer generation roles ("answer" or "both") ---
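For intuition on question_avg_rank above: np.average with weights is simply a weighted mean. A tiny worked example with made-up numbers:

import numpy as np

ranks = [4, 5, 3]          # hypothetical ranks from three judge models
weights = [0.5, 0.3, 0.2]  # hypothetical per-model weights

# (4*0.5 + 5*0.3 + 3*0.2) / (0.5 + 0.3 + 0.2) = 4.1
print(np.average(ranks, weights=weights))  # 4.1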
@@ -724,7 +703,7 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
|
|
| 724 |
)
|
| 725 |
answer_futures.append(future)
|
| 726 |
except TimeoutError as e:
|
| 727 |
-
|
| 728 |
answer = "I am struggling to answer this question" # Treat timeout as error
|
| 729 |
duration = 120 # You can set a default duration or handle it differently if needed
|
| 730 |
answers[model_id] = answer # Store error answer
|
|
@@ -743,15 +722,14 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
|
|
| 743 |
if iteration == 0: # Write header only for the first iteration
|
| 744 |
iteration_results_file_opened.write("Iteration, Topic, Difficulty, Question Rank, QR Duration, Model,Cumulative Avg Rank,Iteration Avg Rank,Ranks,Ranking Duration (sec)\n") # Added Ranking Duration to header
|
| 745 |
|
| 746 |
-
|
| 747 |
for model_id in active_models:
|
| 748 |
-
answer = answers
|
| 749 |
-
duration = answer_durations.get(model_id, 0) # Get duration with default
|
| 750 |
|
| 751 |
if answer == "Error answering": # Handle answer generation errors
|
| 752 |
consecutive_failures[model_id] += 1
|
| 753 |
if consecutive_failures[model_id] >= failure_threshold:
|
| 754 |
-
|
| 755 |
if model_id in active_models: # double check before removing, might have been removed in another thread
|
| 756 |
active_models.remove(model_id)
|
| 757 |
unresponsive_models.add(model_id)
|
|
@@ -759,7 +737,7 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
|
|
| 759 |
|
| 760 |
|
| 761 |
if len(active_models) < 2: # Re-check active models before ranking
|
| 762 |
-
|
| 763 |
break
|
| 764 |
|
| 765 |
ranks = {}
|
|
@@ -804,9 +782,9 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
|
|
| 804 |
|
| 805 |
|
| 806 |
if len(weights_for_valid_ranks) != len(valid_ranks_values):
|
| 807 |
-
|
| 808 |
-
|
| 809 |
-
|
| 810 |
|
| 811 |
average_rank = np.average(valid_ranks_values, weights=weights_for_valid_ranks)
|
| 812 |
|
|
@@ -824,14 +802,11 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
|
|
| 824 |
results["question_rank_duration"].append(question_ranking_duration_total) # Store question ranking duration
|
| 825 |
|
| 826 |
cumulative_model_ranks[model_id].append(average_rank) # Append current iteration's average rank
|
| 827 |
-
|
| 828 |
-
cumulative_avg_rank[model_id] = np.nanmean(cumulative_model_ranks[model_id])
|
| 829 |
-
else:
|
| 830 |
-
cumulative_avg_rank[model_id] = np.nan
|
| 831 |
|
| 832 |
# --- Print and store iteration results IMMEDIATELY after ranking for this model ---
|
| 833 |
ranks_str = "[" + ", ".join(map(str, [ranks[m] for m in active_models if m in ranks])) + "]" if ranks else "[]" # Format ranks for CSV, ensure order
|
| 834 |
-
|
| 835 |
|
| 836 |
# Write iteration results to file (append mode) - write for each model right after ranking
|
| 837 |
iteration_results_file_opened.write(f"{iteration+1},{topic}, {difficulty_mapping[difficulty]},{question_avg_rank:.2f},{question_ranking_duration_total:.2f},{model_id},{cumulative_avg_rank[model_id]:.2f},{average_rank:.2f},{ranks_str},{ranking_duration:.2f}\n")
|
|
@@ -841,10 +816,10 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
        total_valid_rank = 0 # Keep track of the sum of valid (non-NaN) ranks
 
        for m_id in active_models:
-            if
+            if cumulative_avg_rank[m_id]:
                temp_weights[m_id] = cumulative_avg_rank[m_id]
                total_valid_rank += cumulative_avg_rank[m_id]
-            else: # if cumulative is empty
+            else: # if cumulative is empty, keep original
                temp_weights[m_id] = model_weights.get(m_id, 1.0 / len(active_models))
 
        # Normalize the weights so they sum to 1, handling cases where total_valid_rank might be zero
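The normalization mentioned in the last context line is not part of this hunk. A minimal sketch of one way such a weight dictionary can be renormalized to sum to 1 (illustrative only; the uniform fallback for an all-zero total is an assumption):

def normalize_weights(temp_weights):
    """Scale a dict of non-negative weights so the values sum to 1."""
    total = sum(temp_weights.values())
    if total == 0:
        # Fall back to uniform weights when no valid ranks exist yet.
        n = len(temp_weights)
        return {k: 1.0 / n for k in temp_weights}
    return {k: v / total for k, v in temp_weights.items()}

print(normalize_weights({"model_a": 4.0, "model_b": 1.0}))  # {'model_a': 0.8, 'model_b': 0.2}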
@@ -858,7 +833,7 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
 
    iteration_results_file_opened.close()
 
+    custom_print(f"Unresponsive models during this run: {unresponsive_models}")
    return results, cumulative_avg_rank, s_t
 
def check_model_availability(models, token):

@@ -909,6 +884,10 @@
# Streamlit UI
st.title("LLM Benchmark")
 
+# Initialize session state variables for progress tracking
+if 'progress' not in st.session_state:
+    st.session_state.progress = 0
+
# Setup sidebar for configuration
st.sidebar.header("Configuration")
 
@@ -950,103 +929,130 @@ model_config = {}
for model in selected_models:
    model_config[model] = {"name": model, "role": "both"}
 
-# Setup to capture results for display
-results_container = st.container()
-with results_container:
-    results_placeholder = st.empty()
-    iterations_table = st.empty()
-        num_iterations, model_config, hf_token
-    )
-        "Model": list(cumulative_avg_rank.keys()),
-        "Average Rank": [round(r, 2) for r in cumulative_avg_rank.values()]
-    })
-    ranking_df = ranking_df.sort_values("Average Rank", ascending=False)
-    results_placeholder.dataframe(ranking_df)
+# Create tabs for different views
+tab1, tab2, tab3 = st.tabs(["Benchmark", "Progress Log", "Debug Log"])
+
+with tab1:
+    if st.sidebar.button("Test Selected Models"):
+        if not hf_token:
+            st.error("Please enter your Hugging Face API token")
+        elif not selected_models:
+            st.error("Please select at least one model")
+        else:
+            with st.spinner("Testing model availability..."):
+                availability = check_model_availability(selected_models, hf_token)
+
+            # Show results in a table
+            availability_df = pd.DataFrame([
+                {
+                    "Model": model,
+                    "Available": info["available"],
+                    "Status": "Available" if info["available"] else "Error",
+                    "Details": info.get("response", "") if info["available"] else info.get("error", "")
+                }
+                for model, info in availability.items()
+            ])
+
+            st.dataframe(availability_df)
+
+            # Check if we have enough models to run the benchmark
+            available_models = [m for m, info in availability.items() if info["available"]]
+            if len(available_models) >= 2:
+                st.success(f"{len(available_models)} models are available for benchmarking")
+            else:
+                st.error("You need at least 2 available models to run the benchmark")
+
+    # Progress bar
+    progress_bar = st.progress(st.session_state.progress)
+
+    # Start benchmark button
+    if st.sidebar.button("Start Benchmark"):
+        # Clear previous outputs
+        st.session_state.main_output = []
+        st.session_state.debug_output = []
+
+        if not hf_token:
+            st.error("Please enter your Hugging Face API token")
+        elif not selected_models:
+            st.error("Please select at least two models")
+        elif not selected_topics:
+            st.error("Please select at least one topic")
+        else:
+            # Setup to capture results for display
+            results_container = st.container()
+
+            # Create a global variable to store intermediate results
+            if 'results_df' not in st.session_state:
+                st.session_state.results_df = pd.DataFrame()
+
+            # Run the benchmark
+            try:
+                # Run benchmark and get results
+                results, cumulative_avg_rank, total_successful = run_benchmark(
+                    selected_models, selected_topics,
+                    ["a very simple", "a simple", "a", "a difficult", "a very difficult"],
+                    num_iterations, model_config, hf_token
+                )
+
+                # Update progress to complete
+                st.session_state.progress = 1.0
+                progress_bar.progress(1.0)
+
+                # Display results
+                if total_successful > 0:
+                    results_df = pd.DataFrame(results)
+                    st.session_state.results_df = results_df
+
+                    # Show model rankings
+                    st.subheader("Model Rankings")
+                    ranking_df = pd.DataFrame({
+                        "Model": list(cumulative_avg_rank.keys()),
+                        "Average Rank": [round(r, 2) for r in cumulative_avg_rank.values()]
+                    })
+                    ranking_df = ranking_df.sort_values("Average Rank", ascending=False)
+                    st.dataframe(ranking_df)
+
+                    # Show detailed results
+                    st.subheader("Detailed Results")
+                    st.dataframe(results_df)
+
+                    # Option to download results
+                    csv = results_df.to_csv(index=False)
+                    st.download_button(
+                        label="Download Results CSV",
+                        data=csv,
+                        file_name="llm_benchmark_results.csv",
+                        mime="text/csv",
+                    )
+                else:
+                    st.warning("The benchmark did not complete any successful iterations.")
+            except Exception as e:
+                st.error(f"An error occurred: {e}")
+                st.exception(e)
+
+    # Show previous results if available
+    elif 'results_df' in st.session_state and not st.session_state.results_df.empty:
+        st.subheader("Previous Results")
+        st.dataframe(st.session_state.results_df)
+
+with tab2:
+    # Display main output log
+    st.subheader("Execution Log")
+    log_container = st.container()
+
+    # Display logs
+    log_text = "\n".join(st.session_state.main_output)
+    log_container.text_area("Progress Log", log_text, height=400)
+
+    # Add a refresh button for the log
+    if st.button("Refresh Log"):
+        st.experimental_rerun()
+
+with tab3:
+    # Display debug output
+    st.subheader("Debug Log")
+    debug_container = st.container()
+
+    # Display debug logs
+    debug_text = "\n".join(st.session_state.debug_output)
+    debug_container.text_area("Debug Information", debug_text, height=400)
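For reference, a stripped-down sketch (not part of this commit) of the tabs-plus-session-state pattern the new UI relies on: buffered log lines and a progress value live in st.session_state and are rendered in separate tabs. The diff calls st.experimental_rerun; newer Streamlit releases expose the same behaviour as st.rerun.

import streamlit as st

if 'main_output' not in st.session_state:
    st.session_state.main_output = []

tab_run, tab_log = st.tabs(["Run", "Log"])

with tab_run:
    progress_bar = st.progress(0.0)
    if st.button("Do work"):
        for i in range(5):
            # Each step appends to the shared log buffer and advances the bar.
            st.session_state.main_output.append(f"step {i + 1} done")
            progress_bar.progress((i + 1) / 5)
        st.success("Finished")

with tab_log:
    st.text_area("Log", "\n".join(st.session_state.main_output), height=200)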