import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

# Set page config
st.set_page_config(
    page_title="LLM Evaluation Framework",
    page_icon="🤖",
    layout="wide"
)
# Title and description
st.title("🤖 LLM Quantitative Evaluation Framework")
st.markdown("Data-driven decision making for Large Language Model selection")
# Model data
models_data = {
    "Model": ["GPT-4 Turbo", "Claude 3 Opus", "Claude 3 Sonnet", "Gemini Pro", "Llama 2 70B", "Mistral 7B"],
    "Provider": ["OpenAI", "Anthropic", "Anthropic", "Google", "Meta", "Mistral AI"],
    "Open Source": [False, False, False, False, True, True],
    "Parameters (B)": [1700, 500, 200, 340, 70, 7],
    "Context Length (K)": [128, 200, 200, 32, 4, 8],
    "Input Cost ($/1K tokens)": [0.01, 0.015, 0.003, 0.0005, 0.0007, 0.0002],
    "Output Cost ($/1K tokens)": [0.03, 0.075, 0.015, 0.0015, 0.0009, 0.0002],
    "Speed (tokens/s)": [40, 35, 45, 50, 30, 60],
    "Latency (s)": [2.5, 3.0, 2.0, 1.8, 4.0, 1.5],
    "Uptime (%)": [99.9, 99.8, 99.8, 99.9, 95.0, 94.0],
    "Rate Limit (req/min)": [500, 400, 600, 1000, 200, 100],
    "Knowledge Cutoff": ["2023-04", "2023-08", "2023-08", "2023-11", "2023-07", "2023-09"]
}

df = pd.DataFrame(models_data)
# Sidebar for weights
st.sidebar.header("🎯 Evaluation Criteria Weights")
st.sidebar.markdown("Adjust the importance of each factor (total should equal 100%)")

weights = {}
weights['performance'] = st.sidebar.slider("Performance", 0, 50, 25)
weights['cost'] = st.sidebar.slider("Cost Efficiency", 0, 50, 25)
weights['speed'] = st.sidebar.slider("Speed", 0, 50, 20)
weights['reliability'] = st.sidebar.slider("Reliability", 0, 50, 15)
weights['compliance'] = st.sidebar.slider("Compliance/Open Source", 0, 50, 10)
weights['integration'] = st.sidebar.slider("Integration Ease", 0, 50, 5)

total_weights = sum(weights.values())
st.sidebar.write(f"**Total: {total_weights}%**")
if total_weights != 100:
    st.sidebar.warning("⚠️ Weights should total 100%")
# Usage scenario
st.sidebar.header("📊 Usage Scenario")
monthly_requests = st.sidebar.number_input("Monthly Requests", value=100000, step=10000)
avg_input_tokens = st.sidebar.number_input("Avg Input Tokens", value=500, step=50)
avg_output_tokens = st.sidebar.number_input("Avg Output Tokens", value=200, step=50)
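# With the default scenario the monthly workload is 100,000 * 500 = 50M input
# tokens and 100,000 * 200 = 20M output tokens.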
# Scoring functions
def calculate_performance_score(row):
    param_score = min((row['Parameters (B)'] / 1700) * 100, 100)
    context_score = min((row['Context Length (K)'] / 200) * 100, 100)
    freshness_score = 100 if row['Knowledge Cutoff'] >= "2023-08" else 70
    return param_score * 0.4 + context_score * 0.4 + freshness_score * 0.2
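# Worked example: for Mistral 7B, param_score = min(7/1700*100, 100) ≈ 0.41,
# context_score = min(8/200*100, 100) = 4.0, and its "2023-09" cutoff compares
# lexicographically >= "2023-08" (valid for zero-padded YYYY-MM strings), so
# freshness_score = 100 and the total is 0.41*0.4 + 4.0*0.4 + 100*0.2 ≈ 21.8.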
def calculate_cost_score(row):
    monthly_cost = monthly_requests * (
        (avg_input_tokens / 1000) * row['Input Cost ($/1K tokens)'] +
        (avg_output_tokens / 1000) * row['Output Cost ($/1K tokens)']
    )
    max_cost = 5000
    return max(0, 100 - (monthly_cost / max_cost) * 100)
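# Worked example with the default scenario: GPT-4 Turbo costs
# 100,000 * (0.5 * 0.01 + 0.2 * 0.03) = $1,100/month, giving a score of
# 100 - (1100/5000) * 100 = 78. Models above $5,000/month floor at 0.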
def calculate_speed_score(row):
    speed_score = (row['Speed (tokens/s)'] / 60) * 50
    latency_score = max(0, 50 - (row['Latency (s)'] / 5) * 50)
    return speed_score + latency_score
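# Worked example: GPT-4 Turbo gets (40/60)*50 ≈ 33.3 for throughput plus
# 50 - (2.5/5)*50 = 25 for latency, for a speed score of ≈ 58.3.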
def calculate_reliability_score(row):
    uptime_score = (row['Uptime (%)'] / 100) * 60
    rate_limit_score = min((row['Rate Limit (req/min)'] / 1000) * 40, 40)
    return uptime_score + rate_limit_score
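# Worked example: GPT-4 Turbo gets (99.9/100)*60 ≈ 59.9 for uptime plus
# min((500/1000)*40, 40) = 20 for rate limits, for a total of ≈ 79.9.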
def calculate_compliance_score(row):
    open_source_bonus = 40 if row['Open Source'] else 0
    return open_source_bonus + 60
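# The flat 60 is a baseline license/compliance score given to every model in
# this simplified rubric; only the 40-point open-source bonus varies per model.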
def calculate_integration_score(row):
    api_score = 70 if not row['Open Source'] else 30
    support_score = 30 if row['Provider'] in ["OpenAI", "Google"] else 20
    return min(api_score + support_score, 100)
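# Worked example: GPT-4 Turbo scores 70 (hosted API) + 30 (OpenAI support) = 100,
# while Llama 2 70B scores 30 (self-hosted) + 20 (other provider) = 50.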
# Calculate scores
df['Performance Score'] = df.apply(calculate_performance_score, axis=1)
df['Cost Score'] = df.apply(calculate_cost_score, axis=1)
df['Speed Score'] = df.apply(calculate_speed_score, axis=1)
df['Reliability Score'] = df.apply(calculate_reliability_score, axis=1)
df['Compliance Score'] = df.apply(calculate_compliance_score, axis=1)
df['Integration Score'] = df.apply(calculate_integration_score, axis=1)
# Calculate weighted overall score
if total_weights > 0:
    df['Overall Score'] = (
        df['Performance Score'] * weights['performance'] / 100 +
        df['Cost Score'] * weights['cost'] / 100 +
        df['Speed Score'] * weights['speed'] / 100 +
        df['Reliability Score'] * weights['reliability'] / 100 +
        df['Compliance Score'] * weights['compliance'] / 100 +
        df['Integration Score'] * weights['integration'] / 100
    ) * (100 / total_weights)
else:
    df['Overall Score'] = 0
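# The (100 / total_weights) factor renormalizes the weighted sum so the overall
# score stays on a 0-100 scale even when the sliders do not sum to exactly 100.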
# Sort by overall score
df_sorted = df.sort_values('Overall Score', ascending=False).reset_index(drop=True)

# Calculate monthly costs
df_sorted['Monthly Cost ($)'] = monthly_requests * (
    (avg_input_tokens / 1000) * df_sorted['Input Cost ($/1K tokens)'] +
    (avg_output_tokens / 1000) * df_sorted['Output Cost ($/1K tokens)']
)
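# Same formula as in calculate_cost_score, vectorized over the frame;
# e.g. Mistral 7B: 100,000 * (0.5 * 0.0002 + 0.2 * 0.0002) = $14/month.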
# Main content area
col1, col2 = st.columns([2, 1])

with col1:
    st.header("🏆 Model Rankings")

    # Display top 3 models with medals
    medals = ["🥇", "🥈", "🥉"]
    for i in range(min(3, len(df_sorted))):
        with st.container():
            st.markdown(f"""
            <div style="border: 2px solid {'gold' if i==0 else 'silver' if i==1 else '#CD7F32'};
                        border-radius: 10px; padding: 15px; margin: 10px 0;
                        background-color: {'#FFF8DC' if i==0 else '#F8F8FF' if i==1 else '#FDF5E6'}">
                <h3>{medals[i]} {df_sorted.iloc[i]['Model']} - {df_sorted.iloc[i]['Provider']}</h3>
                <p><strong>Overall Score: {df_sorted.iloc[i]['Overall Score']:.1f}/100</strong></p>
                <p>Monthly Cost: ${df_sorted.iloc[i]['Monthly Cost ($)']:.2f} |
                   Parameters: {df_sorted.iloc[i]['Parameters (B)']}B |
                   Context: {df_sorted.iloc[i]['Context Length (K)']}K tokens</p>
            </div>
            """, unsafe_allow_html=True)
with col2:
    st.header("💰 Cost Analysis")

    # Cost comparison chart
    fig_cost = px.bar(
        df_sorted,
        x='Monthly Cost ($)',
        y='Model',
        orientation='h',
        title="Monthly Cost Comparison",
        color='Monthly Cost ($)',
        color_continuous_scale='RdYlGn_r'
    )
    fig_cost.update_layout(height=400)
    st.plotly_chart(fig_cost, use_container_width=True)
# Detailed comparison table
st.header("📋 Detailed Comparison")
display_cols = ['Model', 'Provider', 'Overall Score', 'Monthly Cost ($)',
                'Performance Score', 'Cost Score', 'Speed Score',
                'Reliability Score', 'Compliance Score', 'Integration Score']
st.dataframe(df_sorted[display_cols].round(1), use_container_width=True)
# Radar chart for top 3 models
st.header("🎯 Multi-Dimensional Analysis")

categories = ['Performance', 'Cost', 'Speed', 'Reliability', 'Compliance', 'Integration']

fig_radar = go.Figure()
colors = ['gold', 'silver', '#CD7F32']
for i in range(min(3, len(df_sorted))):
    model = df_sorted.iloc[i]
    values = [
        model['Performance Score'],
        model['Cost Score'],
        model['Speed Score'],
        model['Reliability Score'],
        model['Compliance Score'],
        model['Integration Score']
    ]
    fig_radar.add_trace(go.Scatterpolar(
        r=values,
        theta=categories,
        fill='toself',
        name=model['Model'],
        line_color=colors[i]
    ))

fig_radar.update_layout(
    polar=dict(
        radialaxis=dict(
            visible=True,
            range=[0, 100]
        )
    ),
    showlegend=True,
    title="Top 3 Models - Multi-Dimensional Comparison"
)
st.plotly_chart(fig_radar, use_container_width=True)
# Methodology
st.header("🔬 Scoring Methodology")
st.markdown("""
**Performance Score (0-100):**
- Parameters: 40% weight (normalized to GPT-4's 1.7T)
- Context Length: 40% weight (normalized to 200K tokens)
- Knowledge Freshness: 20% weight (cutoff of Aug 2023 or later = 100, else 70)

**Cost Efficiency Score (0-100):**
- Based on total monthly cost for your usage scenario
- Normalized against a $5,000/month baseline
- Higher score = lower cost

**Speed Score (0-100):**
- Tokens/second: 50% weight (normalized to 60 tok/s)
- Latency (inverse): 50% weight (normalized to a 5s maximum)

**Reliability Score (0-100):**
- Uptime percentage: 60% weight
- Rate limits: 40% weight (normalized to 1,000 req/min)

**Compliance Score (0-100):**
- Open-source availability: 40 points
- Baseline license/compliance score: 60 points (applied to all models)

**Integration Score (0-100):**
- API availability: 70 points (hosted/closed source) or 30 points (self-hosted/open source)
- Provider support quality: 30 points (OpenAI, Google) or 20 points (others)
""")