import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

# Set page config
st.set_page_config(
    page_title="LLM Evaluation Framework",
    page_icon="🤖",
    layout="wide"
)

# Title and description
st.title("🤖 LLM Quantitative Evaluation Framework")
st.markdown("Data-driven decision making for Large Language Model selection")

# Model data
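# NOTE: The figures below are illustrative estimates for demonstration purposes.
# Parameter counts for closed-source models are not publicly disclosed, and
# pricing, speed, and rate-limit values change frequently.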
models_data = {
    "Model": ["GPT-4 Turbo", "Claude 3 Opus", "Claude 3 Sonnet", "Gemini Pro", "Llama 2 70B", "Mistral 7B"],
    "Provider": ["OpenAI", "Anthropic", "Anthropic", "Google", "Meta", "Mistral AI"],
    "Open Source": [False, False, False, False, True, True],
    "Parameters (B)": [1700, 500, 200, 340, 70, 7],
    "Context Length (K)": [128, 200, 200, 32, 4, 8],
    "Input Cost ($/1K tokens)": [0.01, 0.015, 0.003, 0.0005, 0.0007, 0.0002],
    "Output Cost ($/1K tokens)": [0.03, 0.075, 0.015, 0.0015, 0.0009, 0.0002],
    "Speed (tokens/s)": [40, 35, 45, 50, 30, 60],
    "Latency (s)": [2.5, 3.0, 2.0, 1.8, 4.0, 1.5],
    "Uptime (%)": [99.9, 99.8, 99.8, 99.9, 95.0, 94.0],
    "Rate Limit (req/min)": [500, 400, 600, 1000, 200, 100],
    "Knowledge Cutoff": ["2023-04", "2023-08", "2023-08", "2023-11", "2023-07", "2023-09"]
}

df = pd.DataFrame(models_data)

# Sidebar for weights
st.sidebar.header("🎯 Evaluation Criteria Weights")
st.sidebar.markdown("Adjust the importance of each factor (total should equal 100%)")

weights = {}
weights['performance'] = st.sidebar.slider("Performance", 0, 50, 25)
weights['cost'] = st.sidebar.slider("Cost Efficiency", 0, 50, 25)
weights['speed'] = st.sidebar.slider("Speed", 0, 50, 20)
weights['reliability'] = st.sidebar.slider("Reliability", 0, 50, 15)
weights['compliance'] = st.sidebar.slider("Compliance/Open Source", 0, 50, 10)
weights['integration'] = st.sidebar.slider("Integration Ease", 0, 50, 5)

total_weights = sum(weights.values())
st.sidebar.write(f"**Total: {total_weights}%**")
if total_weights != 100:
    st.sidebar.warning("⚠️ Weights should total 100%")

# Usage scenario
st.sidebar.header("📊 Usage Scenario")
monthly_requests = st.sidebar.number_input("Monthly Requests", value=100000, step=10000)
avg_input_tokens = st.sidebar.number_input("Avg Input Tokens", value=500, step=50)
avg_output_tokens = st.sidebar.number_input("Avg Output Tokens", value=200, step=50)

# Scoring functions
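# Each function below maps a single model row to a 0-100 score; the internal
# weightings are heuristic choices rather than published benchmarks.
# Performance: 40% parameter count (capped at 1,700B), 40% context length
# (capped at 200K tokens), 20% knowledge freshness.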
def calculate_performance_score(row):
    param_score = min((row['Parameters (B)'] / 1700) * 100, 100)
    context_score = min((row['Context Length (K)'] / 200) * 100, 100)
    freshness_score = 100 if row['Knowledge Cutoff'] >= "2023-08" else 70
    return param_score * 0.4 + context_score * 0.4 + freshness_score * 0.2

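# Cost efficiency: projected monthly spend for the sidebar scenario, scored
# against a $5,000/month ceiling (cheaper = higher score, floored at 0).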
def calculate_cost_score(row):
    monthly_cost = monthly_requests * (
        (avg_input_tokens / 1000) * row['Input Cost ($/1K tokens)'] +
        (avg_output_tokens / 1000) * row['Output Cost ($/1K tokens)']
    )
    max_cost = 5000
    return max(0, 100 - (monthly_cost / max_cost) * 100)

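# Speed: throughput scaled so 60 tok/s earns 50 pts, plus inverse latency
# (5 s or slower scores 0, up to 50 pts).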
def calculate_speed_score(row):
    speed_score = (row['Speed (tokens/s)'] / 60) * 50
    latency_score = max(0, 50 - (row['Latency (s)'] / 5) * 50)
    return speed_score + latency_score

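# Reliability: uptime (up to 60 pts) plus rate limit normalized to 1,000 req/min
# (capped at 40 pts).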
def calculate_reliability_score(row):
    uptime_score = (row['Uptime (%)'] / 100) * 60
    rate_limit_score = min((row['Rate Limit (req/min)'] / 1000) * 40, 40)
    return uptime_score + rate_limit_score

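# Compliance: flat 60-point base for every model plus a 40-point bonus for
# open-source models.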
def calculate_compliance_score(row):
    open_source_bonus = 40 if row['Open Source'] else 0
    return open_source_bonus + 60

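# Integration: hosted-API ease (70 pts closed source, 30 pts open source) plus
# provider support (30 pts for OpenAI/Google, 20 pts otherwise), capped at 100.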
def calculate_integration_score(row):
    api_score = 70 if not row['Open Source'] else 30
    support_score = 30 if row['Provider'] in ["OpenAI", "Google"] else 20
    return min(api_score + support_score, 100)

# Calculate scores
df['Performance Score'] = df.apply(calculate_performance_score, axis=1)
df['Cost Score'] = df.apply(calculate_cost_score, axis=1)
df['Speed Score'] = df.apply(calculate_speed_score, axis=1)
df['Reliability Score'] = df.apply(calculate_reliability_score, axis=1)
df['Compliance Score'] = df.apply(calculate_compliance_score, axis=1)
df['Integration Score'] = df.apply(calculate_integration_score, axis=1)

# Calculate weighted overall score
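# Rescaling by 100/total_weights keeps the composite on a 0-100 scale even when
# the sliders do not sum to exactly 100.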
if total_weights > 0:
    df['Overall Score'] = (
        df['Performance Score'] * weights['performance'] / 100 +
        df['Cost Score'] * weights['cost'] / 100 +
        df['Speed Score'] * weights['speed'] / 100 +
        df['Reliability Score'] * weights['reliability'] / 100 +
        df['Compliance Score'] * weights['compliance'] / 100 +
        df['Integration Score'] * weights['integration'] / 100
    ) * (100 / total_weights)
else:
    df['Overall Score'] = 0

# Sort by overall score
df_sorted = df.sort_values('Overall Score', ascending=False).reset_index(drop=True)

# Calculate monthly costs
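# Same spend formula as in calculate_cost_score, recomputed here for display.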
df_sorted['Monthly Cost ($)'] = monthly_requests * (
    (avg_input_tokens / 1000) * df_sorted['Input Cost ($/1K tokens)'] +
    (avg_output_tokens / 1000) * df_sorted['Output Cost ($/1K tokens)']
)

# Main content area
col1, col2 = st.columns([2, 1])

with col1:
    st.header("🏆 Model Rankings")
    
    # Display top 3 models with medals
    medals = ["🥇", "🥈", "🥉"]
    for i in range(min(3, len(df_sorted))):
        with st.container():
            st.markdown(f"""
            <div style="border: 2px solid {'gold' if i==0 else 'silver' if i==1 else '#CD7F32'}; 
                        border-radius: 10px; padding: 15px; margin: 10px 0;
                        background-color: {'#FFF8DC' if i==0 else '#F8F8FF' if i==1 else '#FDF5E6'}">
                <h3>{medals[i]} {df_sorted.iloc[i]['Model']} - {df_sorted.iloc[i]['Provider']}</h3>
                <p><strong>Overall Score: {df_sorted.iloc[i]['Overall Score']:.1f}/100</strong></p>
                <p>Monthly Cost: ${df_sorted.iloc[i]['Monthly Cost ($)']:.2f} | 
                   Parameters: {df_sorted.iloc[i]['Parameters (B)']}B | 
                   Context: {df_sorted.iloc[i]['Context Length (K)']}K tokens</p>
            </div>
            """, unsafe_allow_html=True)

with col2:
    st.header("💰 Cost Analysis")
    
    # Cost comparison chart
    fig_cost = px.bar(
        df_sorted, 
        x='Monthly Cost ($)', 
        y='Model',
        orientation='h',
        title="Monthly Cost Comparison",
        color='Monthly Cost ($)',
        color_continuous_scale='RdYlGn_r'
    )
    fig_cost.update_layout(height=400)
    st.plotly_chart(fig_cost, use_container_width=True)

# Detailed comparison table
st.header("📊 Detailed Comparison")
display_cols = ['Model', 'Provider', 'Overall Score', 'Monthly Cost ($)', 
                'Performance Score', 'Cost Score', 'Speed Score', 
                'Reliability Score', 'Compliance Score', 'Integration Score']
st.dataframe(df_sorted[display_cols].round(1), use_container_width=True)

# Radar chart for top 3 models
st.header("🎯 Multi-Dimensional Analysis")
categories = ['Performance', 'Cost', 'Speed', 'Reliability', 'Compliance', 'Integration']

fig_radar = go.Figure()

colors = ['gold', 'silver', '#CD7F32']
for i in range(min(3, len(df_sorted))):
    model = df_sorted.iloc[i]
    values = [
        model['Performance Score'],
        model['Cost Score'], 
        model['Speed Score'],
        model['Reliability Score'],
        model['Compliance Score'],
        model['Integration Score']
    ]
    
    fig_radar.add_trace(go.Scatterpolar(
        r=values,
        theta=categories,
        fill='toself',
        name=model['Model'],
        line_color=colors[i]
    ))

fig_radar.update_layout(
    polar=dict(
        radialaxis=dict(
            visible=True,
            range=[0, 100]
        )),
    showlegend=True,
    title="Top 3 Models - Multi-Dimensional Comparison"
)

st.plotly_chart(fig_radar, use_container_width=True)

# Methodology
st.header("🔬 Scoring Methodology")
st.markdown("""
**Performance Score (0-100):**
- Parameters: 40% weight (normalized to an assumed 1.7T-parameter ceiling)
- Context Length: 40% weight (normalized to 200K tokens)
- Knowledge Freshness: 20% weight (cutoff of Aug 2023 or later = 100, else 70)

**Cost Efficiency Score (0-100):**
- Based on total monthly cost for your usage scenario
- Normalized against $5,000/month baseline
- Higher score = lower cost

**Speed Score (0-100):**
- Tokens/second: 50% weight (normalized to 60 tok/s)
- Latency (inverse): 50% weight (normalized to 5s max)

**Reliability Score (0-100):**
- Uptime percentage: 60% weight
- Rate limits: 40% weight (normalized to 1000 req/min)

**Compliance Score (0-100):**
- Base score: 60 points (applied to all models)
- Open-source availability bonus: 40 points

**Integration Score (0-100):**
- Hosted API availability: 70 points (closed source) or 30 points (open source)
- Provider support quality: 30 points (OpenAI, Google) or 20 points (others)
""")