Upload 6 files
Browse files- app.py +441 -0
- config.py +50 -0
- data_loader.py +243 -0
- dev_tools.py +102 -0
- requirements.txt +6 -0
- scoring.py +203 -0
app.py
ADDED
|
@@ -0,0 +1,441 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import numpy as np
|
| 4 |
+
import plotly.graph_objects as go
|
| 5 |
+
import tempfile
|
| 6 |
+
import os
|
| 7 |
+
import re
|
| 8 |
+
from data_loader import loader
|
| 9 |
+
from scoring import ScoringEngine, PRESET_CONFIGS, METRIC_MAP
|
| 10 |
+
from dev_tools import DevSuite
|
| 11 |
+
from config import *
|
| 12 |
+
|
| 13 |
+
# Module-level cache holding the fully scored dataframe (lazy-initialized).
_CACHED_DF = None

def get_dataframe():
    """Return the scored dataframe, loading and scoring it on first access."""
    global _CACHED_DF
    if _CACHED_DF is not None:
        return _CACHED_DF
    if FORCE_REFRESH_ON_STARTUP:
        print("๐ First load: Clearing cache...")
        loader.clear_cache()
    raw = loader.load_data()
    # Only run the scoring pipeline when there is actual data to score.
    _CACHED_DF = raw if raw.empty else ScoringEngine(raw).calculate_all()
    return _CACHED_DF
|
| 29 |
+
|
| 30 |
+
def format_params(row):
    """Format a model's parameter counts (stored in billions) for display.

    Returns "N/A" when the total is missing or non-positive, a single value
    like "7.0B" or "500M" for dense models, and "46.7B (Act: 12.9B)" for
    MoE models whose active count differs from the total.
    """
    total = row.get('Total Parameters', 0)
    active = row.get('Active Parameters', 0)
    if pd.isna(total) or total <= 0:
        return "N/A"

    def fmt(x):
        # Values below 1B are rendered in millions for readability.
        try:
            val = float(x)
        except (TypeError, ValueError):  # was a bare except: — only conversion can fail here
            return "?"
        if val <= 0:
            return "?"
        if val < 1:
            return f"{val*1000:.0f}M"
        return f"{val:.1f}B"

    # A missing or equal active count means the model is dense (not MoE).
    if pd.isna(active) or active <= 0 or active == total:
        return fmt(total)
    return f"{fmt(total)} (Act: {fmt(active)})"
|
| 47 |
+
|
| 48 |
+
def escape_markdown(text):
    """Backslash-escape Markdown control characters in *text*."""
    special_chars = r'([\[\]()\*_#~`])'
    return re.sub(special_chars, r'\\\1', str(text))
|
| 50 |
+
|
| 51 |
+
def format_model_link(row):
    """Render the model name, as a Markdown link when a valid URL is present."""
    raw_name = str(row.get('author/model_name', 'Unknown'))
    url = row.get('Model Link', '')
    label = escape_markdown(raw_name)
    # Only trust string links that actually look like URLs.
    has_url = pd.notna(url) and isinstance(url, str) and url.startswith('http')
    return f"[{label}]({url})" if has_url else label
|
| 58 |
+
|
| 59 |
+
def get_architecture_choices(df):
    """Return sorted architecture names, excluding placeholder values.

    Safely returns an empty list when the dataframe is None, empty, or
    lacks an 'Architecture' column (previously this raised KeyError).
    """
    if df is None or df.empty or 'Architecture' not in df.columns:
        return []
    placeholders = {'unknown', 'nan', 'null', 'none'}
    valid_archs = [a for a in df['Architecture'].dropna().unique()
                   if str(a).lower() not in placeholders]
    return sorted(valid_archs)
|
| 65 |
+
|
| 66 |
+
def filter_leaderboard(df, preset, query, param_min, param_max, proprietary,
                       moe_only, thinking_mode, model_types, architecture, top_n,
                       balance_filter):
    """Apply every UI filter to the scored dataframe.

    Returns (display_df, export_df): the first is formatted for the Gradio
    table (markdown links, formatted params), the second keeps the raw
    columns for CSV export. Both are empty when nothing matches or the
    preset's score column is missing.
    """
    if df is None or df.empty:
        return pd.DataFrame(), pd.DataFrame()

    mask = pd.Series(True, index=df.index)

    # Search: match model name OR architecture, case-insensitively.
    if query:
        search_mask = (
            df['author/model_name'].astype(str).str.contains(query, case=False, na=False) |
            df['Architecture'].astype(str).str.contains(query, case=False, na=False)
        )
        mask &= search_mask

    # Preset filtering (Pocket Genius): cap model size at 12B.
    if preset == "๐ค Pocket Genius":
        mask &= (df['Total Parameters'] <= 12.0)

    # Params filtering
    has_params = df['Total Parameters'].notna() & (df['Total Parameters'] > 0)
    in_range = (df['Total Parameters'] >= param_min) & (df['Total Parameters'] <= param_max)

    if proprietary:
        # Keep models with unknown parameter counts alongside in-range ones.
        mask &= (has_params & in_range) | ~has_params
    else:
        mask &= has_params & in_range

    if moe_only:
        # MoE models are identified by having fewer active than total params.
        mask &= (df['Active Parameters'] < df['Total Parameters'])

    if thinking_mode == "Hide Thinking":
        mask &= ~df['Is Thinking Model']
    elif thinking_mode == "Only Thinking":
        mask &= df['Is Thinking Model']

    # Model Types: union of the selected type flags.
    # NOTE(review): when no type is selected, type_mask.any() is False and the
    # type filter is silently skipped (shows everything) — confirm intended.
    type_mask = pd.Series(False, index=df.index)
    for model_type, col in [("Foundation", "Is Foundation"), ("Finetuned", "Is Finetuned"), ("Merged", "Is Merged")]:
        if model_type in model_types and col in df.columns:
            type_mask |= df[col]
    if type_mask.any():
        mask &= type_mask

    if architecture and architecture != "All":
        mask &= (df['Architecture'] == architecture)

    # === BALANCE FILTER LOGIC ===
    # The robustness threshold is always evaluated against the fixed
    # "Perfect Balance" score column, regardless of the active preset.
    if balance_filter != "Show All":
        threshold = 0.0
        if "Perfect" in balance_filter: threshold = 0.7
        elif "Good" in balance_filter: threshold = 0.5
        elif "Basic" in balance_filter: threshold = 0.3

        target_col = "Score_๐ Perfect Balance"

        if target_col in df.columns:
            mask &= (df[target_col] >= threshold)

    # Presets map to dynamically named score columns produced by ScoringEngine.
    score_col = f"Score_{preset}"

    if score_col not in df.columns:
        return pd.DataFrame(), pd.DataFrame()

    result = df[mask].sort_values(score_col, ascending=False).head(top_n).copy()

    if result.empty:
        return pd.DataFrame(), pd.DataFrame()

    # Unformatted copy kept for the CSV export path.
    export_df = result.copy()

    # Formatting for the on-screen table.
    result['Rank'] = range(1, len(result) + 1)
    result['Model Name'] = result.apply(format_model_link, axis=1)
    result['Parameters'] = result.apply(format_params, axis=1)
    result['Architecture'] = result['Architecture'].apply(str)
    result['Date'] = pd.to_datetime(result['Release Date'], errors='coerce').dt.strftime('%Y-%m-%d').fillna('-')
    result = result.rename(columns={score_col: "โญ Score"})

    display_cols = ['Rank', 'Model Name', "โญ Score", 'Date', 'Badges', 'Parameters', 'Architecture']
    return result[display_cols], export_df
|
| 148 |
+
|
| 149 |
+
def compare_models(df, model_names_text):
    """Build a radar chart and comparison table for the listed models.

    `model_names_text` holds one model name per line; names must match the
    'author/model_name' column exactly. Returns (figure, dataframe), or
    (None, empty DataFrame) when nothing matches.
    """
    if df is None or not model_names_text:
        return None, pd.DataFrame()
    targets = [x.strip() for x in model_names_text.split('\n') if x.strip()]
    subset = df[df['author/model_name'].isin(targets)].copy()
    if subset.empty:
        return None, pd.DataFrame()

    # Radar axes: display label -> underlying score column.
    metrics = {'Logic': 'Composite_WorldModel', 'Knowledge': 'norm_Textbook', 'Style': 'norm_Style',
               'Roleplay': 'gauss_Dialogue', 'Freedom': 'Composite_Unbound'}
    fig = go.Figure()
    for _, row in subset.iterrows():
        values = []
        for col in metrics.values():
            val = float(row.get(col, 0))
            if abs(val) > 90000: val = 0  # Handle sentinel values (missing-data markers)
            values.append(val)
        # Repeat the first point to close the radar polygon.
        values.append(values[0])
        categories = list(metrics.keys()) + [list(metrics.keys())[0]]
        fig.add_trace(go.Scatterpolar(r=values, theta=categories, fill='toself', name=row['author/model_name'][:30]))
    fig.update_layout(polar=dict(radialaxis=dict(visible=True, range=[0, 1])), showlegend=True, height=500)

    compare_cols = ['author/model_name', 'Total Parameters', 'Score_๐ Divine RP', 'norm_Style', 'Composite_WorldModel']
    compare_df = subset[compare_cols].rename(columns={
        'author/model_name': 'Model', 'Total Parameters': 'Params', 'Score_๐ Divine RP': 'Divine RP',
        'norm_Style': 'Writing Style', 'Composite_WorldModel': 'World Model'
    })
    return fig, compare_df
|
| 177 |
+
|
| 178 |
+
def calculate_custom_score(df, weights_dict):
    """Score every model with user-supplied weights and return the top 50."""
    if df is None or df.empty:
        return pd.DataFrame()
    engine = ScoringEngine(df.copy())
    scored = df.copy()
    scored['Custom_Score'] = engine.calculate_weighted_score(weights_dict).round(3)
    top = scored.sort_values('Custom_Score', ascending=False).head(50)
    keep = ['author/model_name', 'Custom_Score', 'Total Parameters', 'Badges']
    renames = {'author/model_name': 'Model', 'Custom_Score': 'โญ Score', 'Total Parameters': 'Params'}
    return top[keep].copy().rename(columns=renames)
|
| 188 |
+
|
| 189 |
+
def run_diagnostics(df):
    """Run the dev-tools suite; returns (report, anomalies_df, statistics_df)."""
    if df is None or df.empty:
        return "โ No data loaded", pd.DataFrame(), pd.DataFrame()
    suite = DevSuite(df)
    report = suite.run_all_tests()
    return report, suite.get_anomalies_df(), suite.get_statistics_df()
|
| 194 |
+
|
| 195 |
+
def clear_and_reload():
    """Drop both cache layers, reload the data, and report what happened."""
    global _CACHED_DF
    # Invalidate the in-memory cache first, then the on-disk one.
    _CACHED_DF = None
    loader.clear_cache()
    reloaded = get_dataframe()
    status = f"โ Cache cleared!\nDeleted files: data_cache.parquet, meta.json\n๐ Data reloaded: {len(reloaded)} rows"
    return reloaded, status
|
| 202 |
+
|
| 203 |
+
# Gradio UI definition: all components are declared inside this Blocks context,
# followed by the event bindings that wire them to the functions above.
with gr.Blocks() as demo:
    initial_df = get_dataframe()
    # Gradio State objects carry per-session copies of the data between events.
    df_state = gr.State(initial_df)
    filtered_raw_state = gr.State()  # unformatted result of the last filter run (for CSV export)

    gr.Markdown(f"""
    # ๐ UGI Leaderboard: Presets Edition v3.6
    **Last Updated:** {loader.last_updated} | **Models:** {len(initial_df)} | **PID:** {os.getpid()}
    """)

    with gr.Tabs():
        with gr.Tab("๐ Leaderboard"):
            # Upper Control Panel
            with gr.Row(variant="panel", equal_height=True):
                with gr.Column(scale=5):
                    preset_dropdown = gr.Radio(
                        choices=list(PRESET_CONFIGS.keys()) + ["โก Efficiency King", "๐ค Pocket Genius"],
                        value="๐ Divine RP",
                        label="๐ฏ Preset",
                        interactive=True
                    )
                with gr.Column(scale=1, min_width=150):
                    refresh_btn = gr.Button("๐ Refresh Data", variant="secondary", size="lg")

            # Filters Accordion
            with gr.Accordion("โ๏ธ Hardware & Filters", open=False):
                with gr.Row():
                    param_min = gr.Slider(0, MAX_PARAMS_SLIDER, 0, step=1, label="Min Parameters (B)")
                    param_max = gr.Slider(0, MAX_PARAMS_SLIDER, MAX_PARAMS_SLIDER, step=1, label="Max Parameters (B)")
                with gr.Row():
                    proprietary_check = gr.Checkbox(value=True, label="Include Proprietary (unknown params)")
                    moe_check = gr.Checkbox(value=False, label="MoE Only")
                    thinking_mode = gr.Radio(["Show All", "Hide Thinking", "Only Thinking"], value="Show All", label="Reasoning Models")
                with gr.Row():
                    model_types = gr.CheckboxGroup(["Foundation", "Finetuned", "Merged"], value=["Foundation", "Finetuned", "Merged"], label="Model Types")
                    arch_dropdown = gr.Dropdown(["All"] + get_architecture_choices(initial_df), value="All", label="Architecture")
                    top_n_slider = gr.Slider(10, 500, DEFAULT_TOP_N, step=10, label="Top N")

                # NEW BALANCE FILTER
                with gr.Row():
                    balance_filter = gr.Radio(
                        choices=["Show All", "๐ Perfect (โฅ0.7)", "๐ Good (โฅ0.5)", "โ๏ธ Basic (โฅ0.3)"],
                        value="Show All",
                        label="๐ก๏ธ Robustness Filter (Objective Metrics Only)",
                        info="Filters out models with weak spots in 13 core metrics (Knowledge, Logic, Syntax)."
                    )

            search_box = gr.Textbox(label="๐ Search Models (name or architecture)", placeholder="e.g., Llama, Qwen, MistralForCausalLM...")
            # datatype order must match display_cols in filter_leaderboard.
            leaderboard_table = gr.Dataframe(datatype=["number", "markdown", "number", "str", "str", "str", "str"], wrap=True, interactive=False)

            # Export
            with gr.Row():
                with gr.Column(scale=1):
                    export_btn = gr.Button("๐ฅ Export CSV", variant="primary", size="sm")
                with gr.Column(scale=4):
                    export_file = gr.File(label="Download CSV", visible=False, height=50)

        with gr.Tab("โ๏ธ Compare"):
            gr.Markdown("### Compare Multiple Models")
            with gr.Row():
                with gr.Column(scale=2):
                    search_compare = gr.Textbox(label="๐ Search to Add Models", placeholder="Type model name...")
                    search_results_radio = gr.Radio(choices=[], label="Select from results", interactive=True)
                    add_model_btn = gr.Button("โ Add Model", variant="secondary")
                with gr.Column(scale=3):
                    compare_textbox = gr.Textbox(label="๐ Comparing (one per line)", lines=8, placeholder="Add models using search...")

            compare_btn = gr.Button("๐ Generate Comparison", variant="primary")
            with gr.Row():
                radar_plot = gr.Plot(label="๐ Radar Chart")
                compare_table = gr.Dataframe(label="๐ Comparison Table")

        with gr.Tab("๐จ Custom Weights"):
            gr.Markdown("### Create Your Own Preset")
            gr.Markdown("Adjust weights for each metric (must sum to 1.0)")
            with gr.Row():
                with gr.Column():
                    w_textbook = gr.Slider(0, 1, 0.12, step=0.01, label="๐ Textbook Knowledge")
                    w_popculture = gr.Slider(0, 1, 0.08, step=0.01, label="๐ฌ Pop Culture")
                    w_worldmodel = gr.Slider(0, 1, 0.10, step=0.01, label="๐ World Model")
                    w_instruction = gr.Slider(0, 1, 0.10, step=0.01, label="๐ Instruction Following")
                    w_style = gr.Slider(0, 1, 0.25, step=0.01, label="โ๏ธ Writing Style")
                with gr.Column():
                    w_originality = gr.Slider(0, 1, 0.10, step=0.01, label="โจ Originality")
                    w_dialogue = gr.Slider(0, 1, 0.15, step=0.01, label="๐ฌ Dialogue Balance")
                    w_unbound = gr.Slider(0, 1, 0.05, step=0.01, label="๐ Unbound")
                    w_redundancy = gr.Slider(0, 1, 0.05, step=0.01, label="๐งน Low Redundancy")
            weight_sum_display = gr.Markdown("**Total Weight:** 1.00")
            calc_custom_btn = gr.Button("๐ฏ Calculate Custom Score", variant="primary")
            custom_results = gr.Dataframe(label="Top 50 Models")

        with gr.Tab("๐ About"):
            gr.Markdown(f"""
            # ๐ About UGI Leaderboard v3.6

            ## ๐ฏ Presets Explained

            ### ๐ Divine RP
            Perfect balance for roleplay and creative storytelling. Emphasizes writing style (25%), dialogue (15%), and world knowledge.

            ### ๐ถ๏ธ Erotic Storyteller
            Optimized for NSFW creative content. High unbound weight (30%), NSFW tone (15%).

            ### ๐ Perfect Balance (NEW)
            **Hybrid Score (Min ร Geometric Mean).** Requires consistency across all objective metrics (Knowledge, Logic, Style, Structure). Rewards models that are "good at everything" and punishes those with even one weak spot.

            ### โ๏ธ No Weak Spots (NEW)
            **Harmonic Mean.** Extremely strict. One failing metric (e.g., poor instruction following) will destroy the entire score, regardless of how good the other metrics are.

            ### ๐ค T-800 Logic
            Pure logic and knowledge. Prioritizes textbook (40%) and world model (35%).

            ### โ๏ธ Literary Virtuoso
            Literary quality above all. Writing style (35%), originality (30%), low redundancy (15%).

            ### ๐ฒ Dungeon Master
            World-building specialist. World model (30%), combined knowledge (30%).

            ### ๐ Dark Novelist
            Dark fiction specialist. Dark tone (25%), writing style (25%), hazardous (15%).

            ### ๐งผ Anti-Slop
            Maximum originality. Fights generic outputs with originality (45%) and redundancy penalties (35%).

            ### ๐ฏ Concise Assistant
            Direct and efficient. Instruction (35%), low redundancy (30%).

            ### ๐ช Entertainment Savant
            Pop culture expert. Pop culture (40%), entertainment (25%).

            ### ๐ฌ Unfiltered Scholar
            Uncensored knowledge. Textbook (30%), hazardous (25%), unbound (20%).

            ### โก Efficiency King
            Best performance per parameter. Calculated as: `Divine RP Score / (Params ^ 0.4)`.

            ---

            ## ๐ท๏ธ Badges Key

            - **๐** = **Fresh**: Tested within the last 7 days.
            - **๐ง ** = **Thinking**: Uses Chain-of-Thought (CoT) or reasoning tokens.
            - **๐** = **NSFW**: High frequency of explicit content generation.
            - **๐** = **Repetitive**: Detected repetition loops in outputs.
            - **๐ค** = **Pocket**: Efficient model with โค 10B parameters.
            - **๐ณ** = **Giant**: Massive model with โฅ 70B parameters.

            ---

            ## ๐ Scoring System (v3.6 Updated)

            ### 1. Weighted Average (Smart Handling)
            Unlike previous versions that filled missing data with artificial values, **v3.6 uses dynamic re-weighting**.
            - If a model lacks a specific metric (e.g., "Music Theory"), that metric is excluded from the calculation.
            - The weights of the remaining metrics are scaled up proportionally to sum to 1.0.
            - **Penalty:** If a model has data for less than **{int(INSUFFICIENT_DATA_THRESHOLD*100)}%** of the preset's total weight, the final score is multiplied by **{INSUFFICIENT_DATA_PENALTY}**.

            ### 2. Robust Normalization
            Metrics are normalized using the **5th and 95th percentiles** to ignore outliers.
            - `Score = (Value - P5) / (P95 - P5)`
            - This ensures that one extremely high-scoring model doesn't squash everyone else to zero.

            ### 3. Composites
            - **World Model**: Average of Cooking, GeoGuesser, Weight Estimation, and Music Theory.
            - **Unbound**: Average of Direct Refusal (inverse), Entertainment, and Hazardous knowledge.
            - **Redundancy**: Combination of Semantic and Lexical redundancy metrics.

            ---

            ## ๐ Technical Details
            - **Framework**: Gradio 5.x + Pandas + Plotly
            - **Caching**: Data is cached for {int(CACHE_DURATION/3600)} hours to speed up loading.
            - **Filters**: You can now filter by specific model architectures and robustness levels.

            *Last Updated: {loader.last_updated}*
            """)

    # Placeholders so the binding section below can test for their existence
    # even when the Diagnostics tab is disabled.
    diag_btn = None
    clear_btn = None

    if SHOW_DIAGNOSTICS:
        with gr.Tab("๐ ๏ธ Diagnostics"):
            with gr.Row():
                diag_btn = gr.Button("๐งช Run Diagnostics", variant="primary")
                clear_btn = gr.Button("๐๏ธ Clear Cache & Reload Data", variant="stop")

            cache_status = gr.Textbox(label="Status", lines=3, interactive=False)
            diag_report = gr.Code(label="๐ Diagnostic Report", language="markdown")

            with gr.Accordion("๐ Anomalies", open=False):
                anomalies_table = gr.Dataframe(label="Detected Anomalies")
            with gr.Accordion("๐ Statistics", open=False):
                stats_table = gr.Dataframe(label="Normalization Statistics")

    # === INTERACTIONS (BINDING) ===

    # Order matters: filter_inputs must match filter_leaderboard's signature.
    filter_inputs = [df_state, preset_dropdown, search_box, param_min, param_max, proprietary_check,
                     moe_check, thinking_mode, model_types, arch_dropdown, top_n_slider, balance_filter]
    filter_outputs = [leaderboard_table, filtered_raw_state]

    # Any control change (except df_state itself) re-runs the filter.
    for inp in filter_inputs[1:]:
        inp.change(filter_leaderboard, inputs=filter_inputs, outputs=filter_outputs)

    def refresh_handler():
        # Full reload: drop in-memory and on-disk caches, then refresh the
        # architecture dropdown choices from the new data.
        global _CACHED_DF
        _CACHED_DF = None
        loader.clear_cache()
        new_df = get_dataframe()
        return new_df, gr.update(choices=["All"] + get_architecture_choices(new_df))

    refresh_btn.click(refresh_handler, outputs=[df_state, arch_dropdown]).then(filter_leaderboard, inputs=filter_inputs, outputs=filter_outputs)

    def export_handler(df):
        # Write the last unformatted filter result to a temp CSV for download.
        if df is None or df.empty:
            return gr.update(value=None, visible=False)
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.csv', mode='w', encoding='utf-8')
        df.to_csv(temp_file.name, index=False)
        return gr.update(value=temp_file.name, visible=True)

    export_btn.click(export_handler, inputs=[filtered_raw_state], outputs=[export_file])

    # Compare logic
    search_compare.change(lambda df, q: gr.update(choices=df[df['author/model_name'].str.contains(q, case=False, na=False)]['author/model_name'].head(10).tolist() if q and df is not None else []), inputs=[df_state, search_compare], outputs=[search_results_radio])
    add_model_btn.click(lambda t, s: t + ("\n" if t else "") + s if s else t, inputs=[compare_textbox, search_results_radio], outputs=[compare_textbox])
    compare_btn.click(compare_models, inputs=[df_state, compare_textbox], outputs=[radar_plot, compare_table])

    # Custom Weights logic
    weight_inputs = [w_textbook, w_popculture, w_worldmodel, w_instruction, w_style, w_originality, w_dialogue, w_unbound, w_redundancy]
    for w in weight_inputs: w.change(lambda *args: f"**Total Weight:** {sum(args):.2f}", inputs=weight_inputs, outputs=[weight_sum_display])
    calc_custom_btn.click(lambda *args: calculate_custom_score(get_dataframe(), {k: v for k, v in zip(['Textbook', 'Pop Culture', 'World Model', 'Instruction', 'Writing Style', 'Originality', 'Dialogue', 'Unbound', 'Redundancy'], args)}), inputs=weight_inputs, outputs=[custom_results])

    if SHOW_DIAGNOSTICS and diag_btn and clear_btn:
        diag_btn.click(run_diagnostics, inputs=[df_state], outputs=[diag_report, anomalies_table, stats_table])
        clear_btn.click(clear_and_reload, outputs=[df_state, cache_status]).then(filter_leaderboard, inputs=filter_inputs, outputs=filter_outputs)

    # Populate the leaderboard table when the page first loads.
    demo.load(filter_leaderboard, inputs=filter_inputs, outputs=filter_outputs)
|
| 439 |
+
|
| 440 |
+
# Launch the Gradio app only when run as a script (not when imported).
if __name__ == "__main__":
    demo.launch()
|
config.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Configuration constants for UGI Leaderboard."""
|
| 2 |
+
|
| 3 |
+
# Caching
|
| 4 |
+
CSV_URL = "https://huggingface.co/spaces/DontPlanToEnd/UGI-Leaderboard/resolve/main/ugi-leaderboard-data.csv"
|
| 5 |
+
CACHE_FILE = "data_cache.parquet"
|
| 6 |
+
META_FILE = "meta.json"
|
| 7 |
+
CACHE_DURATION = 6 * 3600 # 6 hours
|
| 8 |
+
|
| 9 |
+
# Scoring Penalties
|
| 10 |
+
INSUFFICIENT_DATA_THRESHOLD = 0.70
|
| 11 |
+
INSUFFICIENT_DATA_PENALTY = 0.3
|
| 12 |
+
REPETITION_BASE = 0.85
|
| 13 |
+
THINKING_THRESHOLD = 5000
|
| 14 |
+
THINKING_PENALTY_POWER = 0.5
|
| 15 |
+
|
| 16 |
+
# Gaussian Targets
|
| 17 |
+
GAUSSIAN_DIALOGUE_TARGET = 0.38
|
| 18 |
+
GAUSSIAN_DIALOGUE_SIGMA = 0.15
|
| 19 |
+
GAUSSIAN_VERBNOUN_TARGET = 0.85
|
| 20 |
+
GAUSSIAN_VERBNOUN_SIGMA = 0.2
|
| 21 |
+
|
| 22 |
+
# Normalization
|
| 23 |
+
ROBUST_QUANTILE_LOW = 0.05
|
| 24 |
+
ROBUST_QUANTILE_HIGH = 0.95
|
| 25 |
+
MIN_STD_THRESHOLD = 1e-9
|
| 26 |
+
|
| 27 |
+
# UI Defaults
|
| 28 |
+
MAX_PARAMS_SLIDER = 500
|
| 29 |
+
DEFAULT_TOP_N = 50
|
| 30 |
+
FORCE_REFRESH_ON_STARTUP = True
|
| 31 |
+
|
| 32 |
+
# === DEV MODE ===
|
| 33 |
+
SHOW_DIAGNOSTICS = False
|
| 34 |
+
|
| 35 |
+
# Objective Metrics List for Balance/Robustness Presets
|
| 36 |
+
# FIXED: Removed 'gauss_VerbNoun' due to scale mismatch
|
| 37 |
+
BALANCE_METRICS_LIST = [
|
| 38 |
+
'norm_Textbook', # Knowledge
|
| 39 |
+
'norm_PopCulture', # Culture
|
| 40 |
+
'norm_Recipe', # Logic
|
| 41 |
+
'norm_Geo', # Geography
|
| 42 |
+
'norm_Weight', # Physics
|
| 43 |
+
'norm_Music', # Music
|
| 44 |
+
'norm_Style', # Style
|
| 45 |
+
'norm_Originality', # Originality
|
| 46 |
+
'gauss_Dialogue', # Structure
|
| 47 |
+
'norm_Instruction', # Precision
|
| 48 |
+
'inv_Semantic', # Coherence
|
| 49 |
+
'inv_Lexical' # Variety
|
| 50 |
+
]
|
data_loader.py
ADDED
|
@@ -0,0 +1,243 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
import os, time, json
|
| 4 |
+
from datetime import datetime, timedelta
|
| 5 |
+
from config import *
|
| 6 |
+
|
| 7 |
+
class DataLoader:
|
| 8 |
+
def __init__(self):
    """Initialize an empty loader; data is fetched lazily via load_data()."""
    self.df = None
    self.last_updated = "Unknown"
|
| 10 |
+
|
| 11 |
+
def load_data(self, force_refresh=False):
    """Load the leaderboard data, with support for forced refresh.

    Fetches the remote CSV when the cache is stale or `force_refresh` is
    True; otherwise reads the local parquet cache. On a fetch error the
    stale cache is used as a fallback (or an empty frame if none exists).
    """
    if force_refresh or self._needs_update():
        print("๐ Cache expired or missing. Fetching fresh data...")
        try:
            self.df = self._process_data(pd.read_csv(CSV_URL, on_bad_lines='skip'))
            self._save_cache()
            print(f"โ Data processed. Rows: {len(self.df)}")
        except Exception as e:
            print(f"โ ๏ธ Error fetching data: {e}")
            # Network/parse failure: fall back to whatever cached copy exists.
            self.df = pd.read_parquet(CACHE_FILE) if os.path.exists(CACHE_FILE) else pd.DataFrame()
            self._load_meta()
    else:
        print("โก Loading from cache.")
        self.df = pd.read_parquet(CACHE_FILE)
        self._load_meta()
    return self.df
|
| 28 |
+
|
| 29 |
+
def _needs_update(self):
    """Return True when the cache files are missing or older than CACHE_DURATION."""
    if not os.path.exists(CACHE_FILE) or not os.path.exists(META_FILE):
        return True
    try:
        with open(META_FILE) as f:
            return (time.time() - json.load(f).get('timestamp', 0)) > CACHE_DURATION
    except (OSError, ValueError, AttributeError):
        # Was a bare except:. Unreadable or corrupt metadata means the cache
        # is stale; json.JSONDecodeError is a ValueError subclass, and
        # AttributeError covers a metadata file holding a non-dict value.
        return True
|
| 38 |
+
|
| 39 |
+
def clear_cache(self):
    """Force-delete every cache file; returns the list of files removed."""
    removed = []
    for path in (CACHE_FILE, META_FILE):
        if not os.path.exists(path):
            continue
        try:
            os.remove(path)
        except Exception as e:
            print(f"โ ๏ธ Failed to delete {path}: {e}")
        else:
            removed.append(path)
    if removed:
        print(f"๐๏ธ Cleared cache: {', '.join(removed)}")
    return removed
|
| 52 |
+
|
| 53 |
+
def _save_cache(self):
    """Persist the dataframe to parquet and record the refresh timestamp."""
    self.df.to_parquet(CACHE_FILE)
    now = time.time()
    with open(META_FILE, 'w') as f:
        json.dump({'timestamp': now}, f)
    self.last_updated = datetime.fromtimestamp(now).strftime("%Y-%m-%d %H:%M")
|
| 58 |
+
|
| 59 |
+
def _load_meta(self):
    """Restore `last_updated` from the cache metadata file, if readable."""
    try:
        with open(META_FILE) as f:
            self.last_updated = datetime.fromtimestamp(json.load(f)['timestamp']).strftime("%Y-%m-%d %H:%M")
    except (OSError, KeyError, ValueError, TypeError):
        # Was a bare except:. A missing or corrupt metadata file is non-fatal;
        # keep the previous `last_updated` value. TypeError covers a
        # non-numeric 'timestamp' entry.
        pass
|
| 65 |
+
|
| 66 |
+
def _clean_column(self, series, scale=1.0):
    """Safely coerce a column to numeric, stripping '%' and rescaling."""
    if pd.api.types.is_string_dtype(series) or series.dtype == 'object':
        series = series.astype(str).str.rstrip('%')
    numeric = pd.to_numeric(series, errors='coerce')
    # A scale of 1 (or less) means the values are already in target range.
    return numeric / scale if scale > 1 else numeric
|
| 72 |
+
|
| 73 |
+
def _get_model_type(self, row):
|
| 74 |
+
"""ะะฟัะตะดะตะปะตะฝะธะต ัะธะฟะฐ ะผะพะดะตะปะธ ะดะปั ัะพััะธัะพะฒะบะธ."""
|
| 75 |
+
# Returns: (sort_value, short_code, full_name)
|
| 76 |
+
if pd.isna(row.get('Total Parameters')) or row.get('Total Parameters', 0) <= 0:
|
| 77 |
+
return (3, 'P', 'Proprietary')
|
| 78 |
+
|
| 79 |
+
is_foundation = row.get('Is Foundation', False)
|
| 80 |
+
is_merged = row.get('Is Merged', False)
|
| 81 |
+
|
| 82 |
+
if is_foundation and not is_merged:
|
| 83 |
+
return (0, 'B', 'Base')
|
| 84 |
+
if is_merged:
|
| 85 |
+
return (2, 'M', 'Merge')
|
| 86 |
+
if row.get('Is Finetuned', False) and not is_merged:
|
| 87 |
+
return (1, 'F', 'Finetune')
|
| 88 |
+
|
| 89 |
+
return (4, '', 'Unknown')
|
| 90 |
+
|
| 91 |
+
def _process_data(self, df):
    """Main processing pipeline.

    Cleans, types, penalizes, normalizes and NA-fills the raw leaderboard
    dataframe in place, then returns it.
    """
    print("โ๏ธ Processing pipeline started...")
    # Stray whitespace in headers would defeat every `col in df.columns` check below.
    df.columns = df.columns.str.strip()

    # === 1. COLUMN GROUPS ===
    # Group name -> (columns, divisor). 'percentage' columns arrive as 0-100,
    # 'scale_10' as 0-10; both are rescaled into 0-1. 'numeric' stays raw.
    col_groups = {
        'percentage': (['Textbook', 'Pop Culture', 'Dialogue_Percentage', 'Verb_to_Noun_Ratio',
                        'Show Rec Correlation', 'avg_length_error_pct'], 100.0),
        'already_norm': (['avg_writing_style_score', 'originality_score', 'internal_semantic_redundancy',
                          'lexical_stuckness', 'wm_recipe_percent_error_score', 'wm_geoguessr_mae_score',
                          'wm_weight_percent_error_score', 'wm_music_mae_score'], 1.0),
        'numeric': (['Total Parameters', 'Active Parameters', 'Repetition Interrupts', 'Avg Thinking Chars'], 1.0),
        'scale_10': (['avg_nsfw_score', 'avg_dark_score', 'Hazardous', 'Entertainment',
                      'SocPol', 'W/10-Direct', 'W/10-Adherence'], 10.0)
    }

    for group, (cols, scale) in col_groups.items():
        for col in cols:
            if col in df.columns:
                df[col] = self._clean_column(df[col], scale)
                if group == 'already_norm':
                    # Guard against upstream values slightly outside [0, 1].
                    df[col] = df[col].clip(0, 1.0)
            else:
                # Create missing columns as NaN so later steps can index them safely.
                df[col] = np.nan

    # === 2. BOOLEANS & STRINGS ===
    # Spreadsheet exports deliver booleans as 'TRUE'/'FALSE' strings.
    if 'Is Thinking Model' in df.columns:
        df['Is Thinking Model'] = (
            df['Is Thinking Model'].astype(str).fillna('FALSE').str.strip().str.upper() == 'TRUE'
        )
    else:
        df['Is Thinking Model'] = False

    # NOTE(review): df.get with a scalar default returns the bare string
    # 'Unknown' (no .fillna) when the column is absent — assumes the column
    # always exists in real data; confirm against the sheet schema.
    df['Architecture'] = df.get('Architecture', 'Unknown').fillna('Unknown').replace('null', 'Unknown')

    # === 3. MODEL TYPES & DATES ===
    # _get_model_type yields (sort_value, short_code, full_name) per row.
    type_data = df.apply(self._get_model_type, axis=1)
    df['_type_sort'] = type_data.apply(lambda x: x[0])
    df['Type_Code'] = type_data.apply(lambda x: x[1])
    df['Type_Name'] = type_data.apply(lambda x: x[2])

    if 'Test Date' in df.columns:
        # Source dates are US-format MM/DD/YYYY; unparsable cells become NaT.
        df['Test Date'] = pd.to_datetime(df['Test Date'], format='%m/%d/%Y', errors='coerce')
        week_ago = datetime.now() - timedelta(days=7)
        # "New" badge window: tested within the last 7 days.
        df['Is_New'] = df['Test Date'].apply(lambda x: True if pd.notna(x) and x >= week_ago else False)
        df['Test Date'] = df['Test Date'].dt.strftime('%Y-%m-%d')
    else:
        df['Is_New'] = False

    # === 4. PENALTIES ===
    # Exponential decay: each repetition interrupt multiplies by REPETITION_BASE.
    df['penalty_repetition'] = REPETITION_BASE ** df['Repetition Interrupts'].fillna(0)

    chars = df['Avg Thinking Chars'].fillna(0)
    # Thinking models are penalized once their average reasoning length
    # exceeds THINKING_THRESHOLD; the +1e-6 avoids division by zero.
    df['penalty_thinking'] = np.where(
        df['Is Thinking Model'] & (chars > THINKING_THRESHOLD),
        np.power(THINKING_THRESHOLD / (chars + 1e-6), THINKING_PENALTY_POWER).clip(upper=1.0),
        1.0
    )

    # === 5. GAUSSIAN SCORES ===
    # Bell-curve scores: best at the target value, decaying on both sides.
    df['gauss_Dialogue'] = self._gaussian_score(df['Dialogue_Percentage'], GAUSSIAN_DIALOGUE_TARGET, GAUSSIAN_DIALOGUE_SIGMA)
    df['gauss_VerbNoun'] = self._gaussian_score(df['Verb_to_Noun_Ratio'], GAUSSIAN_VERBNOUN_TARGET, GAUSSIAN_VERBNOUN_SIGMA)

    # === 6. NORMALIZATION ===
    # destination column -> (source column, mode).
    norm_config = {
        # Direct normalization (Higher = Better)
        'norm_Textbook': ('Textbook', 'direct'),
        'norm_PopCulture': ('Pop Culture', 'direct'),
        'norm_ShowRec': ('Show Rec Correlation', 'direct'),
        'norm_Style': ('avg_writing_style_score', 'direct'),
        'norm_Originality': ('originality_score', 'direct'),
        'norm_NSFW': ('avg_nsfw_score', 'direct'),
        'norm_Dark': ('avg_dark_score', 'direct'),
        'norm_Hazardous': ('Hazardous', 'direct'),
        'norm_Entertainment': ('Entertainment', 'direct'),
        'norm_Instruction': ('W/10-Adherence', 'direct'),
        'norm_Unbound_Direct': ('W/10-Direct', 'direct'),
        # World Model (Direct)
        'norm_Recipe': ('wm_recipe_percent_error_score', 'direct'),
        'norm_Geo': ('wm_geoguessr_mae_score', 'direct'),
        'norm_Weight': ('wm_weight_percent_error_score', 'direct'),
        'norm_Music': ('wm_music_mae_score', 'direct'),
        # Inverse normalization (Higher = Worse)
        'inv_Semantic': ('internal_semantic_redundancy', 'inverse'),
        'inv_Lexical': ('lexical_stuckness', 'inverse'),
        'inv_LengthErr': ('avg_length_error_pct', 'inverse')
    }

    for dest, (src, mode) in norm_config.items():
        if src in df.columns:
            df[dest] = self._inverse_normalize(df[src]) if mode == 'inverse' else self._robust_normalize(df[src])
        else:
            df[dest] = np.nan

    # === 7. COMPOSITES ===
    # skipna=False: a composite is NaN if ANY of its components is missing.
    composites = {
        'Composite_WorldModel': ['norm_Recipe', 'norm_Geo', 'norm_Weight', 'norm_Music'],
        'Composite_Unbound': ['norm_Unbound_Direct', 'norm_Entertainment', 'norm_Hazardous'],
        'Composite_Redundancy': ['inv_Semantic', 'inv_Lexical']
    }
    for comp, cols in composites.items():
        df[comp] = df[cols].mean(axis=1, skipna=False)

    # === 8. SMART NA FILLING (For Sorting) ===
    # Sentinel fills so NaN rows always sort last regardless of direction.
    # Downstream code treats |value| >= 90000 as "no data".
    print("๐ง Applying smart NA handling for sorting...")
    higher_is_better = [
        'Show Rec Correlation', 'norm_Textbook', 'norm_PopCulture', 'norm_ShowRec',
        'norm_Style', 'norm_Originality', 'Composite_WorldModel', 'Composite_Unbound',
        'norm_Recipe', 'norm_Geo', 'norm_Weight', 'norm_Music'
    ]
    for col in higher_is_better:
        if col in df.columns:
            df[col] = df[col].fillna(-99999)

    lower_is_better = [
        'avg_length_error_pct', 'internal_semantic_redundancy', 'lexical_stuckness',
        'inv_Semantic', 'inv_Lexical', 'inv_LengthErr'
    ]
    for col in lower_is_better:
        if col in df.columns:
            df[col] = df[col].fillna(99999)

    print("โ… Processing complete!")
    return df
|
| 216 |
+
|
| 217 |
+
def _robust_normalize(self, series):
    """Map a series into [0, 1] using its 5th-95th percentile window.

    Values outside the window are clipped before scaling. Returns an
    all-NaN series when there is no usable spread (empty data, near-zero
    std, or a degenerate percentile window) so callers can detect
    "no signal".
    """
    observed = series.dropna()
    if observed.empty or observed.std() < MIN_STD_THRESHOLD:
        return pd.Series(np.nan, index=series.index)
    lo = observed.quantile(ROBUST_QUANTILE_LOW)
    hi = observed.quantile(ROBUST_QUANTILE_HIGH)
    spread = hi - lo
    if abs(spread) < MIN_STD_THRESHOLD:
        return pd.Series(np.nan, index=series.index)
    return (series.clip(lo, hi) - lo) / spread
|
| 227 |
+
|
| 228 |
+
def _inverse_normalize(self, series):
    """Inverse robust normalization: high raw values map to low scores.

    (p95 - clip(x, p5, p95)) / (p95 - p5) == 1 - (clip(x, p5, p95) - p5) / (p95 - p5),
    so this delegates to _robust_normalize instead of duplicating its
    guard logic (empty/constant series still yield an all-NaN series,
    since 1.0 - NaN is NaN).
    """
    return 1.0 - self._robust_normalize(series)
|
| 238 |
+
|
| 239 |
+
def _gaussian_score(self, series, target, sigma):
|
| 240 |
+
return np.exp(-((series - target) ** 2) / (2 * sigma ** 2))
|
| 241 |
+
|
| 242 |
+
# Module-level instance: every importer of this module shares this one loader
# (and therefore its in-memory/on-disk cache).
loader = DataLoader()
|
dev_tools.py
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
from datetime import datetime
|
| 4 |
+
from scoring import METRIC_MAP, PRESET_CONFIGS
|
| 5 |
+
|
| 6 |
+
class DevSuite:
    """Self-diagnostic test battery for the processed leaderboard dataframe.

    Each check appends pass/fail results into ``self.report``;
    ``run_all_tests`` returns a markdown summary. The dataframe itself is
    never mutated.
    """

    def __init__(self, df):
        # df: fully processed leaderboard dataframe (may be None or empty).
        self.df = df
        # summary counters feed the executive summary; the *_issues lists
        # hold human-readable messages per severity level.
        self.report = {
            "summary": {"critical": 0, "medium": 0, "low": 0, "tests_passed": 0},
            "critical_issues": [], "medium_issues": [], "low_issues": [], "anomalies": [], "statistics": {}
        }

    def run_all_tests(self):
        """Run every check and return the markdown report string."""
        if self.df is None or self.df.empty:
            self._add_issue("critical", "DataFrame is empty or None.")
            return self._generate_markdown_report()

        self._test_normalization_bounds()
        self._test_parameter_scaling()
        self._test_badges_logic()
        self._test_weight_sums()
        self._test_score_ranges()
        self._collect_normalization_stats()

        return self._generate_markdown_report()

    def get_anomalies_df(self):
        """Collected anomaly records as a DataFrame (empty frame when none)."""
        return pd.DataFrame(self.report["anomalies"]) if self.report["anomalies"] else pd.DataFrame()

    def get_statistics_df(self):
        """Per-column normalization stats, one row per column."""
        return pd.DataFrame(self.report["statistics"]).T if self.report["statistics"] else pd.DataFrame()

    def _test_normalization_bounds(self):
        """Every mapped normalized column must stay inside [0, 1] (± epsilon)."""
        norm_cols = [v[0] for k, v in METRIC_MAP.items() if v[0] in self.df.columns]
        for col in norm_cols:
            # Ignore sentinel values (the ±99999 NA-fill markers used for sorting)
            values = self.df[col].dropna()
            values = values[values.abs() < 90000]
            if values.empty: continue
            if values.min() < -1e-6 or values.max() > 1.0 + 1e-6:
                self._add_issue("critical", f"Normalization bounds broken in '{col}'")
            else:
                self.report["summary"]["tests_passed"] += 1

    def _test_parameter_scaling(self):
        """Spot-check that parameter counts are stored in billions."""
        if 'Total Parameters' in self.df.columns:
            # Check a known big model
            big_model = self.df[self.df['author/model_name'].str.contains("Llama-3.1-405B", case=False, na=False)]
            if not big_model.empty and big_model.iloc[0]['Total Parameters'] < 400:
                self._add_issue("critical", "Parameter scaling issue: 405B model appears small.")
            else:
                self.report["summary"]["tests_passed"] += 1

    def _test_badges_logic(self):
        """The pocket badge must only be assigned to models <= 15B params."""
        if 'Badges' in self.df.columns:
            # NOTE(review): str.contains here has no na= argument — if Badges
            # ever holds NaN this mask raises; confirm Badges is always filled.
            pocket = self.df[self.df['Badges'].str.contains("๐ค")]
            if not pocket.empty and pocket['Total Parameters'].max() > 15:
                self._add_issue("medium", "Pocket badge assigned to large model.")
            else:
                self.report["summary"]["tests_passed"] += 1

    def _test_weight_sums(self):
        """Weighted presets should sum to 1.0; special presets are skipped."""
        for preset, weights in PRESET_CONFIGS.items():
            if isinstance(weights, dict) and 'special_type' not in weights:
                if abs(sum(weights.values()) - 1.0) > 1e-4:
                    self._add_issue("medium", f"Preset '{preset}' weights != 1.0")
                else:
                    self.report["summary"]["tests_passed"] += 1

    def _test_score_ranges(self):
        """Score_ columns (except Efficiency, which is unbounded) must stay in [0, 1.1]."""
        score_cols = [c for c in self.df.columns if c.startswith("Score_")]
        for col in score_cols:
            if 'Efficiency' in col: continue
            vals = self.df[col].dropna()
            if not vals.empty and (vals.min() < 0 or vals.max() > 1.1):
                self._add_issue("medium", f"Score out of range in {col}")
            else:
                self.report["summary"]["tests_passed"] += 1

    def _collect_normalization_stats(self):
        """Record min/max/mean per normalized column (sentinels excluded)."""
        norm_cols = [v[0] for k, v in METRIC_MAP.items() if v[0] in self.df.columns]
        for col in norm_cols:
            values = self.df[col].dropna()
            values = values[values.abs() < 90000]
            self.report["statistics"][col] = {
                "min": float(values.min()) if not values.empty else 0,
                "max": float(values.max()) if not values.empty else 0,
                "mean": float(values.mean()) if not values.empty else 0
            }

    def _add_issue(self, level, message):
        # level must be one of 'critical' / 'medium' / 'low': it is used both
        # as a summary-counter key and as the "<level>_issues" list prefix.
        self.report["summary"][level] += 1
        self.report[f"{level}_issues"].append(message)

    def _generate_markdown_report(self):
        """Render the accumulated report as a short markdown document."""
        r = self.report
        md = [f"## Executive Summary\n- Passed: {r['summary']['tests_passed']}\n- Critical: {r['summary']['critical']}"]
        if r['critical_issues']:
            md.append("### Critical Issues")
            md.extend([f"- {i}" for i in r['critical_issues']])
        return "\n".join(md)
|
requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio>=5.0.0
|
| 2 |
+
pandas>=2.0.0
|
| 3 |
+
numpy>=1.24.0
|
| 4 |
+
pyarrow>=12.0.0
|
| 5 |
+
requests>=2.0.0
|
| 6 |
+
plotly>=5.0.0
|
scoring.py
ADDED
|
@@ -0,0 +1,203 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
from config import *
|
| 4 |
+
|
| 5 |
+
# Maps a human-facing metric label to (normalized_column, raw_source_column).
# The normalized column (norm_*/inv_*/gauss_*/Composite_*) is what scoring
# consumes; the second entry is the column it was derived from.
METRIC_MAP = {
    'Textbook': ('norm_Textbook', 'Textbook'),
    'Pop Culture': ('norm_PopCulture', 'Pop Culture'),
    'World Model': ('Composite_WorldModel', 'Composite_WorldModel'),
    'Instruction': ('norm_Instruction', 'W/10-Adherence'),
    'Writing Style': ('norm_Style', 'avg_writing_style_score'),
    'Originality': ('norm_Originality', 'originality_score'),
    'Dialogue': ('gauss_Dialogue', 'Dialogue_Percentage'),
    'Unbound': ('Composite_Unbound', 'Composite_Unbound'),
    'NSFW Tone': ('norm_NSFW', 'avg_nsfw_score'),
    'Dark Tone': ('norm_Dark', 'avg_dark_score'),
    'Redundancy': ('Composite_Redundancy', 'Composite_Redundancy'),
    'Hazardous': ('norm_Hazardous', 'Hazardous'),
    'Entertainment': ('norm_Entertainment', 'Entertainment'),
    'Length Acc': ('inv_LengthErr', 'avg_length_error_pct'),
    'VerbNoun': ('gauss_VerbNoun', 'Verb_to_Noun_Ratio')
}
|
| 22 |
+
|
| 23 |
+
# Preset name -> either a dict of METRIC_MAP-label weights (intended to sum
# to 1.0 — DevSuite verifies this), or a {'special_type', 'metrics'} spec
# that ScoringEngine routes to its balanced/harmonic calculators.
PRESET_CONFIGS = {
    "๐ Divine RP": {
        'Textbook': 0.12, 'Pop Culture': 0.08, 'World Model': 0.10,
        'Instruction': 0.10, 'Writing Style': 0.25, 'Originality': 0.10,
        'Dialogue': 0.15, 'Unbound': 0.05, 'Redundancy': 0.05
    },
    "๐ถ๏ธ Erotic Storyteller": {
        'World Model': 0.10, 'Instruction': 0.05, 'Writing Style': 0.15,
        'Originality': 0.05, 'Dialogue': 0.15, 'Unbound': 0.30,
        'NSFW Tone': 0.15, 'Redundancy': 0.05
    },
    "๐ค T-800 Logic": {
        'Textbook': 0.40, 'World Model': 0.35, 'Instruction': 0.20, 'Redundancy': 0.05
    },
    "โ๏ธ Literary Virtuoso": {
        'Writing Style': 0.35, 'Originality': 0.30, 'Redundancy': 0.15,
        'Instruction': 0.10, 'Dialogue': 0.10
    },
    "๐ฒ Dungeon Master": {
        'World Model': 0.30, 'Textbook': 0.15, 'Pop Culture': 0.15,
        'Instruction': 0.20, 'Originality': 0.10, 'Dialogue': 0.10
    },
    "๐ Dark Novelist": {
        'Dark Tone': 0.25, 'Writing Style': 0.25, 'Hazardous': 0.15,
        'Originality': 0.20, 'Unbound': 0.15
    },
    "๐งผ Anti-Slop": {
        'Originality': 0.45, 'Redundancy': 0.35, 'Writing Style': 0.10, 'Instruction': 0.10
    },
    "๐ฏ Concise Assistant": {
        'Instruction': 0.35, 'Redundancy': 0.30, 'Textbook': 0.20,
        'World Model': 0.10, 'Dialogue': 0.05
    },
    "๐ช Entertainment Savant": {
        'Pop Culture': 0.40, 'Entertainment': 0.25, 'Instruction': 0.15,
        'Writing Style': 0.10, 'Dialogue': 0.10
    },
    "๐ฌ Unfiltered Scholar": {
        'Textbook': 0.30, 'Hazardous': 0.25, 'Unbound': 0.20,
        'Instruction': 0.15, 'Originality': 0.05, 'Redundancy': 0.05
    },
    # === BALANCE PRESETS ===
    # These reward consistency across BALANCE_METRICS_LIST rather than any
    # weighted sum; see ScoringEngine._calculate_special_score.
    "๐ Perfect Balance": {
        'special_type': 'balanced',
        'metrics': BALANCE_METRICS_LIST
    },
    "โ๏ธ No Weak Spots": {
        'special_type': 'harmonic',
        'metrics': BALANCE_METRICS_LIST
    }
}
|
| 74 |
+
|
| 75 |
+
class ScoringEngine:
    """Computes every preset score plus badges on a processed dataframe.

    Expects the columns produced by DataLoader._process_data (norm_*,
    penalty_*, composites). All work happens on a private copy.
    """

    def __init__(self, df):
        # Copy so the caller's dataframe is never mutated.
        self.df = df.copy()

    def calculate_all(self):
        """Compute all preset scores, derived scores and badges; return the df."""
        if self.df.empty:
            return self.df

        print("๐งฎ Calculating scores...")

        for preset_name, config in PRESET_CONFIGS.items():
            col_name = f"Score_{preset_name}"

            # Special presets carry a routing key; plain presets are weight dicts.
            if isinstance(config, dict) and 'special_type' in config:
                if config['special_type'] == 'balanced':
                    self.df[col_name] = self._calculate_balanced_score(config['metrics'])
                elif config['special_type'] == 'harmonic':
                    self.df[col_name] = self._calculate_harmonic_score(config['metrics'])
            else:
                self.df[col_name] = self.calculate_weighted_score(config)

        # Efficiency King
        # Unknown/zero parameter counts become 9999 so the size divisor
        # effectively removes proprietary models from this ranking.
        params = self.df['Total Parameters'].fillna(0).replace(0, 9999)
        base_score = self.df.get("Score_๐ Divine RP", 0)
        divisor = np.power(params, 0.4)  # sub-linear penalty for model size
        self.df["Score_โก Efficiency King"] = (base_score / divisor) * 10
        self.df["Score_โก Efficiency King"] = self.df["Score_โก Efficiency King"].fillna(0)

        # Pocket Genius reuses the flagship score unchanged — presumably the
        # size restriction is applied by a filter elsewhere; confirm in the UI layer.
        self.df["Score_๐ค Pocket Genius"] = self.df.get("Score_๐ Divine RP", 0)

        self._generate_badges_vectorized()

        # Round scores
        score_cols = [c for c in self.df.columns if c.startswith("Score_")]
        self.df[score_cols] = self.df[score_cols].round(3)

        return self.df

    def calculate_weighted_score(self, weights_dict):
        """Standard weighted average logic ignoring sentinel values."""
        weighted_sum = pd.Series(0.0, index=self.df.index)
        total_valid_weight = pd.Series(0.0, index=self.df.index)
        total_preset_weight = sum(weights_dict.values())

        for key, weight in weights_dict.items():
            if key not in METRIC_MAP: continue
            norm_col, _ = METRIC_MAP[key]
            if norm_col not in self.df.columns: continue

            values = self.df[norm_col]
            # |value| >= 90000 marks the ±99999 NA-fill sentinels — exclude them.
            mask = values.notna() & (values.abs() < 90000)

            weighted_sum[mask] += values[mask] * weight
            total_valid_weight[mask] += weight

        # Per-row renormalization: rows missing some metrics are averaged over
        # the weight that was actually available; rows with none score 0.
        final_score = weighted_sum / total_valid_weight.replace(0, np.nan)
        final_score = final_score.fillna(0.0)

        # Penalties
        # Rows covered by too small a fraction of the preset's weight are damped.
        valid_weight_ratio = total_valid_weight / total_preset_weight
        insufficient_mask = valid_weight_ratio < INSUFFICIENT_DATA_THRESHOLD
        final_score[insufficient_mask] *= INSUFFICIENT_DATA_PENALTY

        self._apply_global_penalties(final_score)
        return final_score

    def _calculate_balanced_score(self, metric_keys):
        """Hybrid: sqrt(min) * sqrt(geometric_mean)."""
        return self._calculate_special_score(metric_keys, method='hybrid')

    def _calculate_harmonic_score(self, metric_keys):
        """Harmonic Mean."""
        return self._calculate_special_score(metric_keys, method='harmonic')

    def _calculate_special_score(self, metric_keys, method):
        """Shared implementation for the 'hybrid' and 'harmonic' balance scores.

        NOTE(review): any other *method* value leaves final_score unbound and
        raises UnboundLocalError below — currently unreachable, since the only
        callers are the two wrappers above.
        """
        # metric_keys here are expected to be dataframe column names.
        cols_to_use = [col for col in metric_keys if col in self.df.columns]
        if not cols_to_use:
            return pd.Series(0.0, index=self.df.index)

        subset = self.df[cols_to_use].copy()

        # Filter out sentinel values
        for col in subset.columns:
            subset[col] = subset[col].where(subset[col].abs() < 90000)

        # FIXED: Soft NaN handling ( (0.3 + median)/2 )
        # Missing metrics are imputed between a 0.3 floor and the column
        # median, so one absent metric doesn't zero the whole score.
        for col in subset.columns:
            col_median = subset[col].median()
            if pd.isna(col_median) or col_median <= 0:
                fill_val = 0.3
            else:
                fill_val = (0.3 + col_median) / 2
            subset[col] = subset[col].fillna(fill_val)

        # FIXED: Softer clip (0.1 instead of 0.01)
        # The 0.1 lower bound also keeps log() and 1/x below well-defined.
        subset = subset.clip(lower=0.1, upper=1.0)

        if method == 'hybrid':
            # Geometric mean via exp(mean(log)) blended with the worst metric.
            min_score = subset.min(axis=1)
            log_mean = np.log(subset).mean(axis=1)
            geom_score = np.exp(log_mean)
            final_score = np.sqrt(min_score) * np.sqrt(geom_score)

        elif method == 'harmonic':
            n = len(cols_to_use)
            sum_inverse = (1.0 / subset).sum(axis=1)
            final_score = n / sum_inverse

        self._apply_global_penalties(final_score)
        return final_score

    def _apply_global_penalties(self, score_series):
        # Mutates score_series in place (via *=); callers rely on that.
        if 'penalty_repetition' in self.df.columns:
            score_series *= self.df['penalty_repetition'].fillna(1.0)
        if 'penalty_thinking' in self.df.columns:
            score_series *= self.df['penalty_thinking'].fillna(1.0)

    def _generate_badges_vectorized(self):
        """Build the Badges string column from boolean/threshold conditions."""
        badges = pd.Series("", index=self.df.index)
        if 'Is_New' in self.df: badges += np.where(self.df['Is_New'], "๐ ", "")
        if 'Is Thinking Model' in self.df: badges += np.where(self.df['Is Thinking Model'], "๐ง ", "")
        if 'norm_NSFW' in self.df: badges += np.where((self.df['norm_NSFW'] > 0.5) & (self.df['norm_NSFW'] < 90000), "๐ ", "")
        if 'Repetition Interrupts' in self.df: badges += np.where(self.df['Repetition Interrupts'] >= 1.0, "๐ ", "")

        # NOTE(review): df.get with a scalar default returns a bare int (no
        # .fillna) when 'Total Parameters' is absent — assumes _process_data
        # always created the column; confirm that invariant holds.
        params = self.df.get('Total Parameters', 999).fillna(999)
        # Pocket badge (<=10B) and whale badge (>=70B) by parameter count.
        badges += np.where((params > 0) & (params <= 10), "๐ค ", "")
        badges += np.where(params >= 70, "๐ณ ", "")

        self.df['Badges'] = badges.str.strip()
|