Update app.py
app.py CHANGED
@@ -63,6 +63,7 @@ def retry_api_request(max_retries=3, wait_time=10):
     return decorator

 # --- Single model request function for Hugging Face ---
+
 @retry_api_request()
 def make_hf_request(model_name, messages, temperature, max_tokens, token=None):
     """
@@ -804,6 +805,51 @@ def run_benchmark(hf_models, topics, difficulties, t, model_config, token=None):
     print(f"Unresponsive models during this run: {unresponsive_models}")
     return results, cumulative_avg_rank, s_t

+def check_model_availability(models, token):
+    """Test if models are available with the provided token"""
+    availability_results = {}
+
+    for model_name in models:
+        st.write(f"Testing availability of {model_name}...")
+        try:
+            # Create a simple test prompt
+            test_prompt = "Hello, are you available?"
+
+            # Use a short timeout to quickly test connectivity
+            client = InferenceClient(model=model_name, token=token)
+            response = client.text_generation(
+                test_prompt,
+                max_new_tokens=10,
+                temperature=0.7,
+                do_sample=True
+            )
+
+            availability_results[model_name] = {
+                "available": True,
+                "response": response[:50] + "..." if len(response) > 50 else response
+            }
+            st.success(f"✅ {model_name} is available")
+
+        except Exception as e:
+            error_msg = str(e)
+            availability_results[model_name] = {
+                "available": False,
+                "error": error_msg
+            }
+
+            if "401" in error_msg or "unauthorized" in error_msg.lower():
+                st.error(f"❌ {model_name}: Authentication error. Check your API token.")
+            elif "404" in error_msg or "not found" in error_msg.lower():
+                st.error(f"❌ {model_name}: Model not found. It may not exist or you may not have access.")
+            elif "429" in error_msg or "rate limit" in error_msg.lower():
+                st.error(f"❌ {model_name}: Rate limit exceeded. Try again later.")
+            else:
+                st.error(f"❌ {model_name}: Unknown error: {error_msg}")
+
+        time.sleep(1)  # Add delay between checks
+
+    return availability_results
+
 # Streamlit UI
 st.title("LLM Benchmark")

@@ -848,6 +894,35 @@ model_config = {}
 for model in selected_models:
     model_config[model] = {"name": model, "role": "both"}

+if st.sidebar.button("Test Selected Models"):
+    if not hf_token:
+        st.error("Please enter your Hugging Face API token")
+    elif not selected_models:
+        st.error("Please select at least one model")
+    else:
+        with st.spinner("Testing model availability..."):
+            availability = check_model_availability(selected_models, hf_token)
+
+            # Show results in a table
+            availability_df = pd.DataFrame([
+                {
+                    "Model": model,
+                    "Available": info["available"],
+                    "Status": "Available" if info["available"] else "Error",
+                    "Details": info.get("response", "") if info["available"] else info.get("error", "")
+                }
+                for model, info in availability.items()
+            ])
+
+            st.dataframe(availability_df)
+
+            # Check if we have enough models to run the benchmark
+            available_models = [m for m, info in availability.items() if info["available"]]
+            if len(available_models) >= 2:
+                st.success(f"{len(available_models)} models are available for benchmarking")
+            else:
+                st.error("You need at least 2 available models to run the benchmark")
+
 # Start benchmark button
 if st.sidebar.button("Start Benchmark"):
     if not hf_token:
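
For reference, the added check_model_availability helper and the "Test Selected Models" handler use names that this diff does not introduce — InferenceClient, st, pd, and time — so imports along the following lines are presumably already present earlier in app.py. This is only a sketch of the assumed import block, not the file's actual code:

    import time

    import pandas as pd
    import streamlit as st
    from huggingface_hub import InferenceClient  # client used for the availability probe

One further note: the added comment mentions using "a short timeout to quickly test connectivity", but no timeout is actually passed to the client. If that behaviour is wanted, huggingface_hub's InferenceClient accepts a timeout argument in seconds, e.g. InferenceClient(model=model_name, token=token, timeout=10); the 10-second value here is only an illustrative choice, not part of the commit.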