VOIDER committed on
Commit
1c249d8
ยท
verified ยท
1 Parent(s): aaf780a

Upload 6 files

Browse files
Files changed (6) hide show
  1. app.py +441 -0
  2. config.py +50 -0
  3. data_loader.py +243 -0
  4. dev_tools.py +102 -0
  5. requirements.txt +6 -0
  6. scoring.py +203 -0
app.py ADDED
@@ -0,0 +1,441 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import numpy as np
4
+ import plotly.graph_objects as go
5
+ import tempfile
6
+ import os
7
+ import re
8
+ from data_loader import loader
9
+ from scoring import ScoringEngine, PRESET_CONFIGS, METRIC_MAP
10
+ from dev_tools import DevSuite
11
+ from config import *
12
+
13
# Lazy loading state
_CACHED_DF = None

def get_dataframe():
    """Return the fully scored dataframe, loading and memoizing it on first use.

    On the very first call, optionally clears the on-disk cache (controlled by
    FORCE_REFRESH_ON_STARTUP from config), then loads and scores the data.
    Subsequent calls return the in-process cached frame.
    """
    global _CACHED_DF
    if _CACHED_DF is not None:
        return _CACHED_DF
    if FORCE_REFRESH_ON_STARTUP:
        print("๐Ÿ”„ First load: Clearing cache...")
        loader.clear_cache()
    raw = loader.load_data()
    # An empty frame is cached as-is; scoring only runs when there is data.
    _CACHED_DF = raw if raw.empty else ScoringEngine(raw).calculate_all()
    return _CACHED_DF
29
+
30
def format_params(row):
    """Format a row's parameter counts as a human-readable string.

    Counts are expressed in billions; sub-1B values render in millions
    (0.5 -> "500M"). MoE models with a distinct active-parameter count
    render as e.g. "46.7B (Act: 12.9B)". Returns "N/A" when the total is
    missing or non-positive, and "?" for individual unparsable values.
    """
    total = row.get('Total Parameters', 0)
    active = row.get('Active Parameters', 0)
    if pd.isna(total) or total <= 0:
        return "N/A"

    def fmt(x):
        try:
            val = float(x)
        except (TypeError, ValueError):  # was a bare except: — narrowed
            return "?"
        if val <= 0:
            return "?"
        if val < 1:
            return f"{val*1000:.0f}M"  # sub-billion counts shown in millions
        return f"{val:.1f}B"

    # Dense models (or an unknown / equal active count) show a single figure.
    if pd.isna(active) or active <= 0 or active == total:
        return fmt(total)
    return f"{fmt(total)} (Act: {fmt(active)})"
47
+
48
def escape_markdown(text):
    """Backslash-escape markdown control characters so names render literally."""
    special = re.compile(r'([\[\]()\*_#~`])')
    return special.sub(r'\\\1', str(text))
50
+
51
def format_model_link(row):
    """Render the model name, as a markdown link when a valid URL is present."""
    raw_name = str(row.get('author/model_name', 'Unknown'))
    url = row.get('Model Link', '')
    display = escape_markdown(raw_name)
    has_url = pd.notna(url) and isinstance(url, str) and url.startswith('http')
    return f"[{display}]({url})" if has_url else display
58
+
59
def get_architecture_choices(df):
    """Return sorted distinct architecture names, excluding placeholder values."""
    if df is None or df.empty:
        return []
    placeholders = {'unknown', 'nan', 'null', 'none'}
    unique_archs = df['Architecture'].dropna().unique()
    return sorted(a for a in unique_archs if str(a).lower() not in placeholders)
65
+
66
def filter_leaderboard(df, preset, query, param_min, param_max, proprietary,
                       moe_only, thinking_mode, model_types, architecture, top_n,
                       balance_filter):
    """Apply every UI filter to *df* and return (display_df, export_df).

    display_df carries render-ready columns for the leaderboard table
    (markdown links, formatted params); export_df keeps the raw matching
    rows for CSV export. Returns a pair of empty DataFrames when nothing
    matches or the preset's score column is absent.
    """
    if df is None or df.empty:
        return pd.DataFrame(), pd.DataFrame()

    # Start with everything selected; each filter ANDs into this mask.
    mask = pd.Series(True, index=df.index)

    # Search (matches model name OR architecture, case-insensitive)
    if query:
        search_mask = (
            df['author/model_name'].astype(str).str.contains(query, case=False, na=False) |
            df['Architecture'].astype(str).str.contains(query, case=False, na=False)
        )
        mask &= search_mask

    # Preset filtering (Pocket Genius): small-model preset caps total params.
    if preset == "๐Ÿค Pocket Genius":
        mask &= (df['Total Parameters'] <= 12.0)

    # Params filtering
    has_params = df['Total Parameters'].notna() & (df['Total Parameters'] > 0)
    in_range = (df['Total Parameters'] >= param_min) & (df['Total Parameters'] <= param_max)

    if proprietary:
        # Keep models with unknown parameter counts alongside in-range ones.
        mask &= (has_params & in_range) | ~has_params
    else:
        mask &= has_params & in_range

    if moe_only:
        # MoE models are identified by active params < total params.
        mask &= (df['Active Parameters'] < df['Total Parameters'])

    if thinking_mode == "Hide Thinking":
        mask &= ~df['Is Thinking Model']
    elif thinking_mode == "Only Thinking":
        mask &= df['Is Thinking Model']

    # Model Types: union of the selected type flags; if nothing matched at
    # all, the type filter is skipped rather than emptying the result.
    type_mask = pd.Series(False, index=df.index)
    for model_type, col in [("Foundation", "Is Foundation"), ("Finetuned", "Is Finetuned"), ("Merged", "Is Merged")]:
        if model_type in model_types and col in df.columns:
            type_mask |= df[col]
    if type_mask.any():
        mask &= type_mask

    if architecture and architecture != "All":
        mask &= (df['Architecture'] == architecture)

    # === BALANCE FILTER LOGIC ===
    # Thresholds keyed off substrings of the UI radio labels.
    if balance_filter != "Show All":
        threshold = 0.0
        if "Perfect" in balance_filter: threshold = 0.7
        elif "Good" in balance_filter: threshold = 0.5
        elif "Basic" in balance_filter: threshold = 0.3

        target_col = "Score_๐Ÿ’Ž Perfect Balance"

        if target_col in df.columns:
            mask &= (df[target_col] >= threshold)

    # Scores are stored per preset under "Score_<preset label>".
    score_col = f"Score_{preset}"

    if score_col not in df.columns:
        return pd.DataFrame(), pd.DataFrame()

    result = df[mask].sort_values(score_col, ascending=False).head(top_n).copy()

    if result.empty:
        return pd.DataFrame(), pd.DataFrame()

    # Raw copy is exported before display-only formatting mutates columns.
    export_df = result.copy()

    # Formatting
    result['Rank'] = range(1, len(result) + 1)
    result['Model Name'] = result.apply(format_model_link, axis=1)
    result['Parameters'] = result.apply(format_params, axis=1)
    result['Architecture'] = result['Architecture'].apply(str)
    result['Date'] = pd.to_datetime(result['Release Date'], errors='coerce').dt.strftime('%Y-%m-%d').fillna('-')
    result = result.rename(columns={score_col: "โญ Score"})

    display_cols = ['Rank', 'Model Name', "โญ Score", 'Date', 'Badges', 'Parameters', 'Architecture']
    return result[display_cols], export_df
148
+
149
def compare_models(df, model_names_text):
    """Build a radar chart and a comparison table for the requested models.

    *model_names_text* is a newline-separated list of exact
    'author/model_name' values. Returns (plotly_figure, DataFrame), or
    (None, empty DataFrame) when there is no input or no match.
    """
    if df is None or not model_names_text:
        return None, pd.DataFrame()
    targets = [x.strip() for x in model_names_text.split('\n') if x.strip()]
    subset = df[df['author/model_name'].isin(targets)].copy()
    if subset.empty:
        return None, pd.DataFrame()

    # Radar axes: display label -> source metric column.
    metrics = {'Logic': 'Composite_WorldModel', 'Knowledge': 'norm_Textbook', 'Style': 'norm_Style',
               'Roleplay': 'gauss_Dialogue', 'Freedom': 'Composite_Unbound'}
    fig = go.Figure()
    for _, row in subset.iterrows():
        values = []
        for col in metrics.values():
            val = float(row.get(col, 0))
            if abs(val) > 90000: val = 0  # Handle sentinel values (NA fillers)
            values.append(val)
        # Close the polygon by repeating the first point/category.
        values.append(values[0])
        categories = list(metrics.keys()) + [list(metrics.keys())[0]]
        fig.add_trace(go.Scatterpolar(r=values, theta=categories, fill='toself', name=row['author/model_name'][:30]))
    fig.update_layout(polar=dict(radialaxis=dict(visible=True, range=[0, 1])), showlegend=True, height=500)

    compare_cols = ['author/model_name', 'Total Parameters', 'Score_๐ŸŒŒ Divine RP', 'norm_Style', 'Composite_WorldModel']
    compare_df = subset[compare_cols].rename(columns={
        'author/model_name': 'Model', 'Total Parameters': 'Params', 'Score_๐ŸŒŒ Divine RP': 'Divine RP',
        'norm_Style': 'Writing Style', 'Composite_WorldModel': 'World Model'
    })
    return fig, compare_df
177
+
178
def calculate_custom_score(df, weights_dict):
    """Score every model with user-supplied weights and return the top 50 rows."""
    if df is None or df.empty:
        return pd.DataFrame()
    engine = ScoringEngine(df.copy())
    scored = df.copy()
    scored['Custom_Score'] = engine.calculate_weighted_score(weights_dict).round(3)
    top = scored.sort_values('Custom_Score', ascending=False).head(50)
    view = top[['author/model_name', 'Custom_Score', 'Total Parameters', 'Badges']].copy()
    renames = {'author/model_name': 'Model', 'Custom_Score': 'โญ Score', 'Total Parameters': 'Params'}
    return view.rename(columns=renames)
188
+
189
def run_diagnostics(df):
    """Run the DevSuite checks and return (report_md, anomalies_df, stats_df)."""
    if df is None or df.empty:
        return "โŒ No data loaded", pd.DataFrame(), pd.DataFrame()
    suite = DevSuite(df)
    report = suite.run_all_tests()
    return report, suite.get_anomalies_df(), suite.get_statistics_df()
194
+
195
def clear_and_reload():
    """Drop both cache layers and rebuild the dataframe from a fresh download."""
    global _CACHED_DF
    _CACHED_DF = None       # in-process memoized copy
    loader.clear_cache()    # on-disk parquet + meta sidecar
    reloaded = get_dataframe()
    status = f"โœ… Cache cleared!\nDeleted files: data_cache.parquet, meta.json\n๐Ÿ”„ Data reloaded: {len(reloaded)} rows"
    return reloaded, status
202
+
203
# === UI DEFINITION ===
# Builds the entire Gradio app: tabbed layout first, then event bindings.
with gr.Blocks() as demo:
    initial_df = get_dataframe()
    df_state = gr.State(initial_df)
    # Holds the raw (unformatted) filtered rows so CSV export skips markdown.
    filtered_raw_state = gr.State()

    gr.Markdown(f"""
    # ๐Ÿ† UGI Leaderboard: Presets Edition v3.6
    **Last Updated:** {loader.last_updated} | **Models:** {len(initial_df)} | **PID:** {os.getpid()}
    """)

    with gr.Tabs():
        with gr.Tab("๐Ÿ… Leaderboard"):
            # Upper Control Panel
            with gr.Row(variant="panel", equal_height=True):
                with gr.Column(scale=5):
                    preset_dropdown = gr.Radio(
                        choices=list(PRESET_CONFIGS.keys()) + ["โšก Efficiency King", "๐Ÿค Pocket Genius"],
                        value="๐ŸŒŒ Divine RP",
                        label="๐ŸŽฏ Preset",
                        interactive=True
                    )
                with gr.Column(scale=1, min_width=150):
                    refresh_btn = gr.Button("๐Ÿ”„ Refresh Data", variant="secondary", size="lg")

            # Filters Accordion
            with gr.Accordion("โš™๏ธ Hardware & Filters", open=False):
                with gr.Row():
                    param_min = gr.Slider(0, MAX_PARAMS_SLIDER, 0, step=1, label="Min Parameters (B)")
                    param_max = gr.Slider(0, MAX_PARAMS_SLIDER, MAX_PARAMS_SLIDER, step=1, label="Max Parameters (B)")
                with gr.Row():
                    proprietary_check = gr.Checkbox(value=True, label="Include Proprietary (unknown params)")
                    moe_check = gr.Checkbox(value=False, label="MoE Only")
                    thinking_mode = gr.Radio(["Show All", "Hide Thinking", "Only Thinking"], value="Show All", label="Reasoning Models")
                with gr.Row():
                    model_types = gr.CheckboxGroup(["Foundation", "Finetuned", "Merged"], value=["Foundation", "Finetuned", "Merged"], label="Model Types")
                    arch_dropdown = gr.Dropdown(["All"] + get_architecture_choices(initial_df), value="All", label="Architecture")
                    top_n_slider = gr.Slider(10, 500, DEFAULT_TOP_N, step=10, label="Top N")

                # NEW BALANCE FILTER
                with gr.Row():
                    balance_filter = gr.Radio(
                        choices=["Show All", "๐Ÿ’Ž Perfect (โ‰ฅ0.7)", "๐Ÿ… Good (โ‰ฅ0.5)", "โš–๏ธ Basic (โ‰ฅ0.3)"],
                        value="Show All",
                        label="๐Ÿ›ก๏ธ Robustness Filter (Objective Metrics Only)",
                        info="Filters out models with weak spots in 13 core metrics (Knowledge, Logic, Syntax)."
                    )

            search_box = gr.Textbox(label="๐Ÿ” Search Models (name or architecture)", placeholder="e.g., Llama, Qwen, MistralForCausalLM...")
            # datatype must stay in sync with display_cols in filter_leaderboard.
            leaderboard_table = gr.Dataframe(datatype=["number", "markdown", "number", "str", "str", "str", "str"], wrap=True, interactive=False)

            # Export
            with gr.Row():
                with gr.Column(scale=1):
                    export_btn = gr.Button("๐Ÿ“ฅ Export CSV", variant="primary", size="sm")
                with gr.Column(scale=4):
                    export_file = gr.File(label="Download CSV", visible=False, height=50)

        with gr.Tab("โš–๏ธ Compare"):
            gr.Markdown("### Compare Multiple Models")
            with gr.Row():
                with gr.Column(scale=2):
                    search_compare = gr.Textbox(label="๐Ÿ” Search to Add Models", placeholder="Type model name...")
                    search_results_radio = gr.Radio(choices=[], label="Select from results", interactive=True)
                    add_model_btn = gr.Button("โž• Add Model", variant="secondary")
                with gr.Column(scale=3):
                    compare_textbox = gr.Textbox(label="๐Ÿ“‹ Comparing (one per line)", lines=8, placeholder="Add models using search...")

            compare_btn = gr.Button("๐Ÿ“Š Generate Comparison", variant="primary")
            with gr.Row():
                radar_plot = gr.Plot(label="๐Ÿ“ˆ Radar Chart")
                compare_table = gr.Dataframe(label="๐Ÿ“Š Comparison Table")

        with gr.Tab("๐ŸŽจ Custom Weights"):
            gr.Markdown("### Create Your Own Preset")
            gr.Markdown("Adjust weights for each metric (must sum to 1.0)")
            with gr.Row():
                with gr.Column():
                    w_textbook = gr.Slider(0, 1, 0.12, step=0.01, label="๐Ÿ“š Textbook Knowledge")
                    w_popculture = gr.Slider(0, 1, 0.08, step=0.01, label="๐ŸŽฌ Pop Culture")
                    w_worldmodel = gr.Slider(0, 1, 0.10, step=0.01, label="๐ŸŒ World Model")
                    w_instruction = gr.Slider(0, 1, 0.10, step=0.01, label="๐Ÿ“‹ Instruction Following")
                    w_style = gr.Slider(0, 1, 0.25, step=0.01, label="โœ๏ธ Writing Style")
                with gr.Column():
                    w_originality = gr.Slider(0, 1, 0.10, step=0.01, label="โœจ Originality")
                    w_dialogue = gr.Slider(0, 1, 0.15, step=0.01, label="๐Ÿ’ฌ Dialogue Balance")
                    w_unbound = gr.Slider(0, 1, 0.05, step=0.01, label="๐Ÿ”“ Unbound")
                    w_redundancy = gr.Slider(0, 1, 0.05, step=0.01, label="๐Ÿงน Low Redundancy")
            weight_sum_display = gr.Markdown("**Total Weight:** 1.00")
            calc_custom_btn = gr.Button("๐ŸŽฏ Calculate Custom Score", variant="primary")
            custom_results = gr.Dataframe(label="Top 50 Models")

        with gr.Tab("๐Ÿ“– About"):
            gr.Markdown(f"""
            # ๐Ÿ† About UGI Leaderboard v3.6

            ## ๐ŸŽฏ Presets Explained

            ### ๐ŸŒŒ Divine RP
            Perfect balance for roleplay and creative storytelling. Emphasizes writing style (25%), dialogue (15%), and world knowledge.

            ### ๐ŸŒถ๏ธ Erotic Storyteller
            Optimized for NSFW creative content. High unbound weight (30%), NSFW tone (15%).

            ### ๐Ÿ’Ž Perfect Balance (NEW)
            **Hybrid Score (Min ร— Geometric Mean).** Requires consistency across all objective metrics (Knowledge, Logic, Style, Structure). Rewards models that are "good at everything" and punishes those with even one weak spot.

            ### โš–๏ธ No Weak Spots (NEW)
            **Harmonic Mean.** Extremely strict. One failing metric (e.g., poor instruction following) will destroy the entire score, regardless of how good the other metrics are.

            ### ๐Ÿค– T-800 Logic
            Pure logic and knowledge. Prioritizes textbook (40%) and world model (35%).

            ### โœ’๏ธ Literary Virtuoso
            Literary quality above all. Writing style (35%), originality (30%), low redundancy (15%).

            ### ๐ŸŽฒ Dungeon Master
            World-building specialist. World model (30%), combined knowledge (30%).

            ### ๐ŸŒ‘ Dark Novelist
            Dark fiction specialist. Dark tone (25%), writing style (25%), hazardous (15%).

            ### ๐Ÿงผ Anti-Slop
            Maximum originality. Fights generic outputs with originality (45%) and redundancy penalties (35%).

            ### ๐ŸŽฏ Concise Assistant
            Direct and efficient. Instruction (35%), low redundancy (30%).

            ### ๐ŸŽช Entertainment Savant
            Pop culture expert. Pop culture (40%), entertainment (25%).

            ### ๐Ÿ”ฌ Unfiltered Scholar
            Uncensored knowledge. Textbook (30%), hazardous (25%), unbound (20%).

            ### โšก Efficiency King
            Best performance per parameter. Calculated as: `Divine RP Score / (Params ^ 0.4)`.

            ---

            ## ๐Ÿท๏ธ Badges Key

            - **๐Ÿ†•** = **Fresh**: Tested within the last 7 days.
            - **๐Ÿง ** = **Thinking**: Uses Chain-of-Thought (CoT) or reasoning tokens.
            - **๐Ÿ”ž** = **NSFW**: High frequency of explicit content generation.
            - **๐Ÿ“‰** = **Repetitive**: Detected repetition loops in outputs.
            - **๐Ÿค** = **Pocket**: Efficient model with โ‰ค 10B parameters.
            - **๐Ÿณ** = **Giant**: Massive model with โ‰ฅ 70B parameters.

            ---

            ## ๐Ÿ“Š Scoring System (v3.6 Updated)

            ### 1. Weighted Average (Smart Handling)
            Unlike previous versions that filled missing data with artificial values, **v3.6 uses dynamic re-weighting**.
            - If a model lacks a specific metric (e.g., "Music Theory"), that metric is excluded from the calculation.
            - The weights of the remaining metrics are scaled up proportionally to sum to 1.0.
            - **Penalty:** If a model has data for less than **{int(INSUFFICIENT_DATA_THRESHOLD*100)}%** of the preset's total weight, the final score is multiplied by **{INSUFFICIENT_DATA_PENALTY}**.

            ### 2. Robust Normalization
            Metrics are normalized using the **5th and 95th percentiles** to ignore outliers.
            - `Score = (Value - P5) / (P95 - P5)`
            - This ensures that one extremely high-scoring model doesn't squash everyone else to zero.

            ### 3. Composites
            - **World Model**: Average of Cooking, GeoGuesser, Weight Estimation, and Music Theory.
            - **Unbound**: Average of Direct Refusal (inverse), Entertainment, and Hazardous knowledge.
            - **Redundancy**: Combination of Semantic and Lexical redundancy metrics.

            ---

            ## ๐Ÿš€ Technical Details
            - **Framework**: Gradio 5.x + Pandas + Plotly
            - **Caching**: Data is cached for {int(CACHE_DURATION/3600)} hours to speed up loading.
            - **Filters**: You can now filter by specific model architectures and robustness levels.

            *Last Updated: {loader.last_updated}*
            """)

    # Pre-declared so the binding guard below can test them even when the
    # diagnostics tab was never built.
    diag_btn = None
    clear_btn = None

    if SHOW_DIAGNOSTICS:
        with gr.Tab("๐Ÿ› ๏ธ Diagnostics"):
            with gr.Row():
                diag_btn = gr.Button("๐Ÿงช Run Diagnostics", variant="primary")
                clear_btn = gr.Button("๐Ÿ—‘๏ธ Clear Cache & Reload Data", variant="stop")

            cache_status = gr.Textbox(label="Status", lines=3, interactive=False)
            diag_report = gr.Code(label="๐Ÿ“‹ Diagnostic Report", language="markdown")

            with gr.Accordion("๐Ÿ” Anomalies", open=False):
                anomalies_table = gr.Dataframe(label="Detected Anomalies")
            with gr.Accordion("๐Ÿ“Š Statistics", open=False):
                stats_table = gr.Dataframe(label="Normalization Statistics")

    # === INTERACTIONS (BINDING) ===

    # Order must match filter_leaderboard's parameter order.
    filter_inputs = [df_state, preset_dropdown, search_box, param_min, param_max, proprietary_check,
                     moe_check, thinking_mode, model_types, arch_dropdown, top_n_slider, balance_filter]
    filter_outputs = [leaderboard_table, filtered_raw_state]

    # Every interactive control (all but df_state) re-runs the filter on change.
    for inp in filter_inputs[1:]:
        inp.change(filter_leaderboard, inputs=filter_inputs, outputs=filter_outputs)

    def refresh_handler():
        # Full refresh: drop both cache layers and rebuild architecture choices.
        global _CACHED_DF
        _CACHED_DF = None
        loader.clear_cache()
        new_df = get_dataframe()
        return new_df, gr.update(choices=["All"] + get_architecture_choices(new_df))

    refresh_btn.click(refresh_handler, outputs=[df_state, arch_dropdown]).then(filter_leaderboard, inputs=filter_inputs, outputs=filter_outputs)

    def export_handler(df):
        # Write the raw filtered rows to a temp CSV and reveal the download widget.
        if df is None or df.empty:
            return gr.update(value=None, visible=False)
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.csv', mode='w', encoding='utf-8')
        df.to_csv(temp_file.name, index=False)
        return gr.update(value=temp_file.name, visible=True)

    export_btn.click(export_handler, inputs=[filtered_raw_state], outputs=[export_file])

    # Compare logic: live search -> radio of matches -> append to textbox.
    search_compare.change(lambda df, q: gr.update(choices=df[df['author/model_name'].str.contains(q, case=False, na=False)]['author/model_name'].head(10).tolist() if q and df is not None else []), inputs=[df_state, search_compare], outputs=[search_results_radio])
    add_model_btn.click(lambda t, s: t + ("\n" if t else "") + s if s else t, inputs=[compare_textbox, search_results_radio], outputs=[compare_textbox])
    compare_btn.click(compare_models, inputs=[df_state, compare_textbox], outputs=[radar_plot, compare_table])

    # Custom Weights logic: live sum display + on-demand scoring.
    weight_inputs = [w_textbook, w_popculture, w_worldmodel, w_instruction, w_style, w_originality, w_dialogue, w_unbound, w_redundancy]
    for w in weight_inputs: w.change(lambda *args: f"**Total Weight:** {sum(args):.2f}", inputs=weight_inputs, outputs=[weight_sum_display])
    calc_custom_btn.click(lambda *args: calculate_custom_score(get_dataframe(), {k: v for k, v in zip(['Textbook', 'Pop Culture', 'World Model', 'Instruction', 'Writing Style', 'Originality', 'Dialogue', 'Unbound', 'Redundancy'], args)}), inputs=weight_inputs, outputs=[custom_results])

    if SHOW_DIAGNOSTICS and diag_btn and clear_btn:
        diag_btn.click(run_diagnostics, inputs=[df_state], outputs=[diag_report, anomalies_table, stats_table])
        clear_btn.click(clear_and_reload, outputs=[df_state, cache_status]).then(filter_leaderboard, inputs=filter_inputs, outputs=filter_outputs)

    # Populate the leaderboard once on page load.
    demo.load(filter_leaderboard, inputs=filter_inputs, outputs=filter_outputs)
439
+
440
# Script entry point: start the Gradio server only when run directly.
if __name__ == "__main__":
    demo.launch()
config.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Configuration constants for UGI Leaderboard."""
2
+
3
+ # Caching
4
+ CSV_URL = "https://huggingface.co/spaces/DontPlanToEnd/UGI-Leaderboard/resolve/main/ugi-leaderboard-data.csv"
5
+ CACHE_FILE = "data_cache.parquet"
6
+ META_FILE = "meta.json"
7
+ CACHE_DURATION = 6 * 3600 # 6 hours
8
+
9
+ # Scoring Penalties
10
+ INSUFFICIENT_DATA_THRESHOLD = 0.70
11
+ INSUFFICIENT_DATA_PENALTY = 0.3
12
+ REPETITION_BASE = 0.85
13
+ THINKING_THRESHOLD = 5000
14
+ THINKING_PENALTY_POWER = 0.5
15
+
16
+ # Gaussian Targets
17
+ GAUSSIAN_DIALOGUE_TARGET = 0.38
18
+ GAUSSIAN_DIALOGUE_SIGMA = 0.15
19
+ GAUSSIAN_VERBNOUN_TARGET = 0.85
20
+ GAUSSIAN_VERBNOUN_SIGMA = 0.2
21
+
22
+ # Normalization
23
+ ROBUST_QUANTILE_LOW = 0.05
24
+ ROBUST_QUANTILE_HIGH = 0.95
25
+ MIN_STD_THRESHOLD = 1e-9
26
+
27
+ # UI Defaults
28
+ MAX_PARAMS_SLIDER = 500
29
+ DEFAULT_TOP_N = 50
30
+ FORCE_REFRESH_ON_STARTUP = True
31
+
32
+ # === DEV MODE ===
33
+ SHOW_DIAGNOSTICS = False
34
+
35
+ # Objective Metrics List for Balance/Robustness Presets
36
+ # FIXED: Removed 'gauss_VerbNoun' due to scale mismatch
37
+ BALANCE_METRICS_LIST = [
38
+ 'norm_Textbook', # Knowledge
39
+ 'norm_PopCulture', # Culture
40
+ 'norm_Recipe', # Logic
41
+ 'norm_Geo', # Geography
42
+ 'norm_Weight', # Physics
43
+ 'norm_Music', # Music
44
+ 'norm_Style', # Style
45
+ 'norm_Originality', # Originality
46
+ 'gauss_Dialogue', # Structure
47
+ 'norm_Instruction', # Precision
48
+ 'inv_Semantic', # Coherence
49
+ 'inv_Lexical' # Variety
50
+ ]
data_loader.py ADDED
@@ -0,0 +1,243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import os, time, json
4
+ from datetime import datetime, timedelta
5
+ from config import *
6
+
7
class DataLoader:
    """Downloads the UGI leaderboard CSV, cleans/normalizes it, and caches the
    processed frame to a local parquet file with a timestamp sidecar."""

    def __init__(self):
        # df: processed dataframe (None until load_data runs);
        # last_updated: human-readable cache timestamp shown in the UI header.
        self.df, self.last_updated = None, "Unknown"

    def load_data(self, force_refresh=False):
        """Load data, with forced-refresh support."""
        if force_refresh or self._needs_update():
            print("๐Ÿ”„ Cache expired or missing. Fetching fresh data...")
            try:
                self.df = self._process_data(pd.read_csv(CSV_URL, on_bad_lines='skip'))
                self._save_cache()
                print(f"โœ… Data processed. Rows: {len(self.df)}")
            except Exception as e:
                # Network/parse failure: fall back to the stale on-disk cache.
                print(f"โš ๏ธ Error fetching data: {e}")
                self.df = pd.read_parquet(CACHE_FILE) if os.path.exists(CACHE_FILE) else pd.DataFrame()
                self._load_meta()
        else:
            print("โšก Loading from cache.")
            self.df = pd.read_parquet(CACHE_FILE)
            self._load_meta()
        return self.df

    def _needs_update(self):
        """Return True when the cache is missing or older than CACHE_DURATION."""
        if not os.path.exists(CACHE_FILE) or not os.path.exists(META_FILE):
            return True
        try:
            with open(META_FILE) as f:
                return (time.time() - json.load(f).get('timestamp', 0)) > CACHE_DURATION
        except:
            # Unreadable/corrupt meta file -> treat cache as expired.
            return True

    def clear_cache(self):
        """Forcibly remove all cache files; return the list of deleted paths."""
        deleted = []
        for file in [CACHE_FILE, META_FILE]:
            if os.path.exists(file):
                try:
                    os.remove(file)
                    deleted.append(file)
                except Exception as e:
                    print(f"โš ๏ธ Failed to delete {file}: {e}")
        if deleted:
            print(f"๐Ÿ—‘๏ธ Cleared cache: {', '.join(deleted)}")
        return deleted

    def _save_cache(self):
        # Persist the processed frame plus a timestamp sidecar for expiry checks.
        self.df.to_parquet(CACHE_FILE)
        with open(META_FILE, 'w') as f:
            json.dump({'timestamp': time.time()}, f)
        self.last_updated = datetime.fromtimestamp(time.time()).strftime("%Y-%m-%d %H:%M")

    def _load_meta(self):
        # Restore the last-updated display string from the sidecar; best-effort.
        try:
            with open(META_FILE) as f:
                self.last_updated = datetime.fromtimestamp(json.load(f)['timestamp']).strftime("%Y-%m-%d %H:%M")
        except:
            pass

    def _clean_column(self, series, scale=1.0):
        """Safely clean and rescale a numeric column (strips '%' then coerces)."""
        if pd.api.types.is_string_dtype(series) or series.dtype == 'object':
            series = series.astype(str).str.rstrip('%')
        series = pd.to_numeric(series, errors='coerce')
        return series / scale if scale > 1 else series

    def _get_model_type(self, row):
        """Classify the model's lineage for sorting."""
        # Returns: (sort_value, short_code, full_name)
        if pd.isna(row.get('Total Parameters')) or row.get('Total Parameters', 0) <= 0:
            return (3, 'P', 'Proprietary')

        is_foundation = row.get('Is Foundation', False)
        is_merged = row.get('Is Merged', False)

        # Merge flag wins over foundation/finetune when combined.
        if is_foundation and not is_merged:
            return (0, 'B', 'Base')
        if is_merged:
            return (2, 'M', 'Merge')
        if row.get('Is Finetuned', False) and not is_merged:
            return (1, 'F', 'Finetune')

        return (4, '', 'Unknown')

    def _process_data(self, df):
        """Main processing pipeline: clean columns, derive flags/dates,
        penalties, gaussian scores, robust normalizations, and composites."""
        print("โš™๏ธ Processing pipeline started...")
        df.columns = df.columns.str.strip()

        # === 1. COLUMN GROUPS ===
        # group -> (columns, divisor). Missing columns are created as NaN so
        # later steps can rely on them existing.
        col_groups = {
            'percentage': (['Textbook', 'Pop Culture', 'Dialogue_Percentage', 'Verb_to_Noun_Ratio',
                            'Show Rec Correlation', 'avg_length_error_pct'], 100.0),
            'already_norm': (['avg_writing_style_score', 'originality_score', 'internal_semantic_redundancy',
                              'lexical_stuckness', 'wm_recipe_percent_error_score', 'wm_geoguessr_mae_score',
                              'wm_weight_percent_error_score', 'wm_music_mae_score'], 1.0),
            'numeric': (['Total Parameters', 'Active Parameters', 'Repetition Interrupts', 'Avg Thinking Chars'], 1.0),
            'scale_10': (['avg_nsfw_score', 'avg_dark_score', 'Hazardous', 'Entertainment',
                          'SocPol', 'W/10-Direct', 'W/10-Adherence'], 10.0)
        }

        for group, (cols, scale) in col_groups.items():
            for col in cols:
                if col in df.columns:
                    df[col] = self._clean_column(df[col], scale)
                    if group == 'already_norm':
                        # These arrive pre-normalized; just clamp to [0, 1].
                        df[col] = df[col].clip(0, 1.0)
                else:
                    df[col] = np.nan

        # === 2. BOOLEANS & STRINGS ===
        if 'Is Thinking Model' in df.columns:
            df['Is Thinking Model'] = (
                df['Is Thinking Model'].astype(str).fillna('FALSE').str.strip().str.upper() == 'TRUE'
            )
        else:
            df['Is Thinking Model'] = False

        df['Architecture'] = df.get('Architecture', 'Unknown').fillna('Unknown').replace('null', 'Unknown')

        # === 3. MODEL TYPES & DATES ===
        type_data = df.apply(self._get_model_type, axis=1)
        df['_type_sort'] = type_data.apply(lambda x: x[0])
        df['Type_Code'] = type_data.apply(lambda x: x[1])
        df['Type_Name'] = type_data.apply(lambda x: x[2])

        if 'Test Date' in df.columns:
            df['Test Date'] = pd.to_datetime(df['Test Date'], format='%m/%d/%Y', errors='coerce')
            week_ago = datetime.now() - timedelta(days=7)
            # Is_New drives the "Fresh" badge (tested within the last 7 days).
            df['Is_New'] = df['Test Date'].apply(lambda x: True if pd.notna(x) and x >= week_ago else False)
            df['Test Date'] = df['Test Date'].dt.strftime('%Y-%m-%d')
        else:
            df['Is_New'] = False

        # === 4. PENALTIES ===
        # Exponential decay per detected repetition interrupt.
        df['penalty_repetition'] = REPETITION_BASE ** df['Repetition Interrupts'].fillna(0)

        chars = df['Avg Thinking Chars'].fillna(0)
        # Penalize thinking models whose reasoning output exceeds the threshold.
        df['penalty_thinking'] = np.where(
            df['Is Thinking Model'] & (chars > THINKING_THRESHOLD),
            np.power(THINKING_THRESHOLD / (chars + 1e-6), THINKING_PENALTY_POWER).clip(upper=1.0),
            1.0
        )

        # === 5. GAUSSIAN SCORES ===
        # Bell-curve scores: 1.0 at the configured target, decaying by sigma.
        df['gauss_Dialogue'] = self._gaussian_score(df['Dialogue_Percentage'], GAUSSIAN_DIALOGUE_TARGET, GAUSSIAN_DIALOGUE_SIGMA)
        df['gauss_VerbNoun'] = self._gaussian_score(df['Verb_to_Noun_Ratio'], GAUSSIAN_VERBNOUN_TARGET, GAUSSIAN_VERBNOUN_SIGMA)

        # === 6. NORMALIZATION ===
        # dest column -> (source column, mode).
        norm_config = {
            # Direct normalization (Higher = Better)
            'norm_Textbook': ('Textbook', 'direct'),
            'norm_PopCulture': ('Pop Culture', 'direct'),
            'norm_ShowRec': ('Show Rec Correlation', 'direct'),
            'norm_Style': ('avg_writing_style_score', 'direct'),
            'norm_Originality': ('originality_score', 'direct'),
            'norm_NSFW': ('avg_nsfw_score', 'direct'),
            'norm_Dark': ('avg_dark_score', 'direct'),
            'norm_Hazardous': ('Hazardous', 'direct'),
            'norm_Entertainment': ('Entertainment', 'direct'),
            'norm_Instruction': ('W/10-Adherence', 'direct'),
            'norm_Unbound_Direct': ('W/10-Direct', 'direct'),
            # World Model (Direct)
            'norm_Recipe': ('wm_recipe_percent_error_score', 'direct'),
            'norm_Geo': ('wm_geoguessr_mae_score', 'direct'),
            'norm_Weight': ('wm_weight_percent_error_score', 'direct'),
            'norm_Music': ('wm_music_mae_score', 'direct'),
            # Inverse normalization (Higher = Worse)
            'inv_Semantic': ('internal_semantic_redundancy', 'inverse'),
            'inv_Lexical': ('lexical_stuckness', 'inverse'),
            'inv_LengthErr': ('avg_length_error_pct', 'inverse')
        }

        for dest, (src, mode) in norm_config.items():
            if src in df.columns:
                df[dest] = self._inverse_normalize(df[src]) if mode == 'inverse' else self._robust_normalize(df[src])
            else:
                df[dest] = np.nan

        # === 7. COMPOSITES ===
        composites = {
            'Composite_WorldModel': ['norm_Recipe', 'norm_Geo', 'norm_Weight', 'norm_Music'],
            'Composite_Unbound': ['norm_Unbound_Direct', 'norm_Entertainment', 'norm_Hazardous'],
            'Composite_Redundancy': ['inv_Semantic', 'inv_Lexical']
        }
        for comp, cols in composites.items():
            # skipna=False: composite is NaN unless every member is present.
            df[comp] = df[cols].mean(axis=1, skipna=False)

        # === 8. SMART NA FILLING (For Sorting) ===
        # Large sentinels push missing values to the bottom of any sort;
        # downstream code filters them out via abs(value) > 90000 checks.
        print("๐Ÿ”ง Applying smart NA handling for sorting...")
        higher_is_better = [
            'Show Rec Correlation', 'norm_Textbook', 'norm_PopCulture', 'norm_ShowRec',
            'norm_Style', 'norm_Originality', 'Composite_WorldModel', 'Composite_Unbound',
            'norm_Recipe', 'norm_Geo', 'norm_Weight', 'norm_Music'
        ]
        for col in higher_is_better:
            if col in df.columns:
                df[col] = df[col].fillna(-99999)

        lower_is_better = [
            'avg_length_error_pct', 'internal_semantic_redundancy', 'lexical_stuckness',
            'inv_Semantic', 'inv_Lexical', 'inv_LengthErr'
        ]
        for col in lower_is_better:
            if col in df.columns:
                df[col] = df[col].fillna(99999)

        print("โœ… Processing complete!")
        return df

    def _robust_normalize(self, series):
        """Robust normalization with divide-by-zero protection."""
        valid = series.dropna()
        if valid.empty or valid.std() < MIN_STD_THRESHOLD:
            return pd.Series(np.nan, index=series.index)
        q05, q95 = valid.quantile(ROBUST_QUANTILE_LOW), valid.quantile(ROBUST_QUANTILE_HIGH)
        denominator = q95 - q05
        if abs(denominator) < MIN_STD_THRESHOLD:
            return pd.Series(np.nan, index=series.index)
        # Clip to the percentile window so outliers don't squash the range.
        return (series.clip(q05, q95) - q05) / denominator

    def _inverse_normalize(self, series):
        """Inverse robust normalization (higher input -> lower score)."""
        valid = series.dropna()
        if valid.empty or valid.std() < MIN_STD_THRESHOLD:
            return pd.Series(np.nan, index=series.index)
        p5, p95 = valid.quantile(ROBUST_QUANTILE_LOW), valid.quantile(ROBUST_QUANTILE_HIGH)
        denominator = p95 - p5
        if abs(denominator) < MIN_STD_THRESHOLD:
            return pd.Series(np.nan, index=series.index)
        return (p95 - series.clip(p5, p95)) / denominator

    def _gaussian_score(self, series, target, sigma):
        # Gaussian bell centered at *target*; equals 1.0 at the target value.
        return np.exp(-((series - target) ** 2) / (2 * sigma ** 2))
241
# Create instance
# Module-level singleton imported by app.py (`from data_loader import loader`).
loader = DataLoader()
dev_tools.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from datetime import datetime
4
+ from scoring import METRIC_MAP, PRESET_CONFIGS
5
+
6
class DevSuite:
    """Self-diagnostic test suite for the processed leaderboard DataFrame.

    Runs a battery of sanity checks (normalization bounds, parameter scaling,
    badge logic, preset weight sums, score ranges) and accumulates findings in
    ``self.report``, which :meth:`run_all_tests` renders to markdown.
    """

    # Sentinel magnitude used by data_loader's NA handling (+/-99999); any
    # value at or beyond this magnitude is "missing", not real data.
    SENTINEL_LIMIT = 90000

    def __init__(self, df):
        # DataFrame under test; may be None or empty (handled in run_all_tests).
        self.df = df
        self.report = {
            "summary": {"critical": 0, "medium": 0, "low": 0, "tests_passed": 0},
            "critical_issues": [], "medium_issues": [], "low_issues": [], "anomalies": [], "statistics": {}
        }

    def run_all_tests(self):
        """Run every check and return a markdown report of the findings."""
        if self.df is None or self.df.empty:
            self._add_issue("critical", "DataFrame is empty or None.")
            return self._generate_markdown_report()

        self._test_normalization_bounds()
        self._test_parameter_scaling()
        self._test_badges_logic()
        self._test_weight_sums()
        self._test_score_ranges()
        self._collect_normalization_stats()

        return self._generate_markdown_report()

    def get_anomalies_df(self):
        """Return collected anomalies as a DataFrame (empty if none)."""
        return pd.DataFrame(self.report["anomalies"]) if self.report["anomalies"] else pd.DataFrame()

    def get_statistics_df(self):
        """Return per-column normalization stats as a DataFrame (empty if none)."""
        return pd.DataFrame(self.report["statistics"]).T if self.report["statistics"] else pd.DataFrame()

    def _test_normalization_bounds(self):
        """Every normalized metric column must stay within [0, 1] (+epsilon)."""
        norm_cols = [v[0] for v in METRIC_MAP.values() if v[0] in self.df.columns]
        for col in norm_cols:
            # Drop NA and the +/-99999 sorting sentinels before checking bounds.
            values = self.df[col].dropna()
            values = values[values.abs() < self.SENTINEL_LIMIT]
            if values.empty:
                continue
            if values.min() < -1e-6 or values.max() > 1.0 + 1e-6:
                self._add_issue("critical", f"Normalization bounds broken in '{col}'")
            else:
                self.report["summary"]["tests_passed"] += 1

    def _test_parameter_scaling(self):
        """Spot-check that parameter counts are in billions (a 405B model >= 400)."""
        # FIX: also require the name column; `.str` on a missing column raised KeyError.
        if 'Total Parameters' in self.df.columns and 'author/model_name' in self.df.columns:
            big_model = self.df[self.df['author/model_name'].str.contains("Llama-3.1-405B", case=False, na=False)]
            if not big_model.empty and big_model.iloc[0]['Total Parameters'] < 400:
                self._add_issue("critical", "Parameter scaling issue: 405B model appears small.")
            else:
                self.report["summary"]["tests_passed"] += 1

    def _test_badges_logic(self):
        """The pocket badge (๐Ÿค) must only be assigned to small models."""
        # FIX: guard on 'Total Parameters' too (KeyError otherwise).
        if 'Badges' in self.df.columns and 'Total Parameters' in self.df.columns:
            # FIX: na=False so NaN badge cells don't produce a non-boolean mask
            # (which raises); regex=False because the emoji is a literal.
            pocket = self.df[self.df['Badges'].str.contains("๐Ÿค", regex=False, na=False)]
            if not pocket.empty and pocket['Total Parameters'].max() > 15:
                self._add_issue("medium", "Pocket badge assigned to large model.")
            else:
                self.report["summary"]["tests_passed"] += 1

    def _test_weight_sums(self):
        """Weighted (non-special) presets must have weights summing to 1.0."""
        for preset, weights in PRESET_CONFIGS.items():
            if isinstance(weights, dict) and 'special_type' not in weights:
                if abs(sum(weights.values()) - 1.0) > 1e-4:
                    self._add_issue("medium", f"Preset '{preset}' weights != 1.0")
                else:
                    self.report["summary"]["tests_passed"] += 1

    def _test_score_ranges(self):
        """Preset scores should land in [0, 1.1]; efficiency has its own scale."""
        score_cols = [c for c in self.df.columns if c.startswith("Score_")]
        for col in score_cols:
            if 'Efficiency' in col:
                continue  # Efficiency King divides by params^0.4, so it is unbounded.
            vals = self.df[col].dropna()
            if not vals.empty and (vals.min() < 0 or vals.max() > 1.1):
                self._add_issue("medium", f"Score out of range in {col}")
            else:
                self.report["summary"]["tests_passed"] += 1

    def _collect_normalization_stats(self):
        """Record min/max/mean of each normalized column (sentinels excluded)."""
        norm_cols = [v[0] for v in METRIC_MAP.values() if v[0] in self.df.columns]
        for col in norm_cols:
            values = self.df[col].dropna()
            values = values[values.abs() < self.SENTINEL_LIMIT]
            self.report["statistics"][col] = {
                "min": float(values.min()) if not values.empty else 0,
                "max": float(values.max()) if not values.empty else 0,
                "mean": float(values.mean()) if not values.empty else 0
            }

    def _add_issue(self, level, message):
        """Record *message* under *level* ('critical' | 'medium' | 'low')."""
        self.report["summary"][level] += 1
        self.report[f"{level}_issues"].append(message)

    def _generate_markdown_report(self):
        """Render the accumulated report as a markdown string."""
        r = self.report
        md = [f"## Executive Summary\n- Passed: {r['summary']['tests_passed']}\n- Critical: {r['summary']['critical']}"]
        if r['critical_issues']:
            md.append("### Critical Issues")
            md.extend([f"- {i}" for i in r['critical_issues']])
        # FIX: medium/low issues were collected but never rendered.
        if r['medium_issues']:
            md.append("### Medium Issues")
            md.extend([f"- {i}" for i in r['medium_issues']])
        if r['low_issues']:
            md.append("### Low Issues")
            md.extend([f"- {i}" for i in r['low_issues']])
        return "\n".join(md)
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ gradio>=5.0.0
2
+ pandas>=2.0.0
3
+ numpy>=1.24.0
4
+ pyarrow>=12.0.0
5
+ requests>=2.0.0
6
+ plotly>=5.0.0
scoring.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from config import *
4
+
5
# Maps a UI-facing metric label -> (scored column, raw/source column).
# The first tuple element is the pre-normalized column used in preset math
# (norm_* / inv_* / gauss_* / Composite_*); for composites both entries are
# the same column.  Keys here are the names referenced by PRESET_CONFIGS.
METRIC_MAP = {
    'Textbook': ('norm_Textbook', 'Textbook'),
    'Pop Culture': ('norm_PopCulture', 'Pop Culture'),
    'World Model': ('Composite_WorldModel', 'Composite_WorldModel'),
    'Instruction': ('norm_Instruction', 'W/10-Adherence'),
    'Writing Style': ('norm_Style', 'avg_writing_style_score'),
    'Originality': ('norm_Originality', 'originality_score'),
    'Dialogue': ('gauss_Dialogue', 'Dialogue_Percentage'),
    'Unbound': ('Composite_Unbound', 'Composite_Unbound'),
    'NSFW Tone': ('norm_NSFW', 'avg_nsfw_score'),
    'Dark Tone': ('norm_Dark', 'avg_dark_score'),
    'Redundancy': ('Composite_Redundancy', 'Composite_Redundancy'),
    'Hazardous': ('norm_Hazardous', 'Hazardous'),
    'Entertainment': ('norm_Entertainment', 'Entertainment'),
    'Length Acc': ('inv_LengthErr', 'avg_length_error_pct'),  # inverse: lower error is better
    'VerbNoun': ('gauss_VerbNoun', 'Verb_to_Noun_Ratio')  # gaussian: target-centered score
}
22
+
23
# Preset -> weight configuration.  Two shapes are supported:
#   * plain dict of {METRIC_MAP key: weight}, weights summing to 1.0, consumed
#     by ScoringEngine.calculate_weighted_score;
#   * {'special_type': 'balanced'|'harmonic', 'metrics': [...]} dicts, routed
#     to the balanced/harmonic scorers in ScoringEngine.calculate_all.
# BALANCE_METRICS_LIST comes from config (star-imported at the top of this file).
PRESET_CONFIGS = {
    "๐ŸŒŒ Divine RP": {
        'Textbook': 0.12, 'Pop Culture': 0.08, 'World Model': 0.10,
        'Instruction': 0.10, 'Writing Style': 0.25, 'Originality': 0.10,
        'Dialogue': 0.15, 'Unbound': 0.05, 'Redundancy': 0.05
    },
    "๐ŸŒถ๏ธ Erotic Storyteller": {
        'World Model': 0.10, 'Instruction': 0.05, 'Writing Style': 0.15,
        'Originality': 0.05, 'Dialogue': 0.15, 'Unbound': 0.30,
        'NSFW Tone': 0.15, 'Redundancy': 0.05
    },
    "๐Ÿค– T-800 Logic": {
        'Textbook': 0.40, 'World Model': 0.35, 'Instruction': 0.20, 'Redundancy': 0.05
    },
    "โœ’๏ธ Literary Virtuoso": {
        'Writing Style': 0.35, 'Originality': 0.30, 'Redundancy': 0.15,
        'Instruction': 0.10, 'Dialogue': 0.10
    },
    "๐ŸŽฒ Dungeon Master": {
        'World Model': 0.30, 'Textbook': 0.15, 'Pop Culture': 0.15,
        'Instruction': 0.20, 'Originality': 0.10, 'Dialogue': 0.10
    },
    "๐ŸŒ‘ Dark Novelist": {
        'Dark Tone': 0.25, 'Writing Style': 0.25, 'Hazardous': 0.15,
        'Originality': 0.20, 'Unbound': 0.15
    },
    "๐Ÿงผ Anti-Slop": {
        'Originality': 0.45, 'Redundancy': 0.35, 'Writing Style': 0.10, 'Instruction': 0.10
    },
    "๐ŸŽฏ Concise Assistant": {
        'Instruction': 0.35, 'Redundancy': 0.30, 'Textbook': 0.20,
        'World Model': 0.10, 'Dialogue': 0.05
    },
    "๐ŸŽช Entertainment Savant": {
        'Pop Culture': 0.40, 'Entertainment': 0.25, 'Instruction': 0.15,
        'Writing Style': 0.10, 'Dialogue': 0.10
    },
    "๐Ÿ”ฌ Unfiltered Scholar": {
        'Textbook': 0.30, 'Hazardous': 0.25, 'Unbound': 0.20,
        'Instruction': 0.15, 'Originality': 0.05, 'Redundancy': 0.05
    },
    # === BALANCE PRESETS (special_type dicts, not weighted averages) ===
    "๐Ÿ’Ž Perfect Balance": {
        'special_type': 'balanced',  # hybrid sqrt(min) * sqrt(geometric mean)
        'metrics': BALANCE_METRICS_LIST
    },
    "โš–๏ธ No Weak Spots": {
        'special_type': 'harmonic',  # harmonic mean punishes any weak metric
        'metrics': BALANCE_METRICS_LIST
    }
}
74
+
75
class ScoringEngine:
    """Computes preset scores and badges on a pre-normalized DataFrame.

    Expects the ``norm_*`` / ``Composite_*`` / ``gauss_*`` / ``inv_*`` columns
    produced by data_loader.  NA-handling sentinels (|value| >= 90000) injected
    there for sorting are excluded from every aggregation here.
    """

    # Sentinel magnitude used by data_loader's NA handling (+/-99999).
    SENTINEL_LIMIT = 90000

    def __init__(self, df):
        # Work on a copy so the caller's frame is never mutated.
        self.df = df.copy()

    def calculate_all(self):
        """Compute every preset score plus badges; return the enriched frame."""
        if self.df.empty:
            return self.df

        print("๐Ÿงฎ Calculating scores...")

        for preset_name, config in PRESET_CONFIGS.items():
            col_name = f"Score_{preset_name}"

            if isinstance(config, dict) and 'special_type' in config:
                if config['special_type'] == 'balanced':
                    self.df[col_name] = self._calculate_balanced_score(config['metrics'])
                elif config['special_type'] == 'harmonic':
                    self.df[col_name] = self._calculate_harmonic_score(config['metrics'])
            else:
                self.df[col_name] = self.calculate_weighted_score(config)

        # Efficiency King: Divine-RP score scaled down by params^0.4.
        # 0/NaN params are mapped to 9999 so they cannot divide by zero
        # (and effectively zero out the efficiency score).
        # FIX: guard against a missing 'Total Parameters' column (KeyError before).
        if 'Total Parameters' in self.df.columns:
            params = self.df['Total Parameters'].fillna(0).replace(0, 9999)
        else:
            params = pd.Series(9999.0, index=self.df.index)
        base_score = self.df.get("Score_๐ŸŒŒ Divine RP", 0)
        divisor = np.power(params, 0.4)
        self.df["Score_โšก Efficiency King"] = (base_score / divisor) * 10
        self.df["Score_โšก Efficiency King"] = self.df["Score_โšก Efficiency King"].fillna(0)

        # Pocket Genius reuses the Divine RP score (ranking is filtered elsewhere).
        self.df["Score_๐Ÿค Pocket Genius"] = self.df.get("Score_๐ŸŒŒ Divine RP", 0)

        self._generate_badges_vectorized()

        # Round all scores for stable display.
        score_cols = [c for c in self.df.columns if c.startswith("Score_")]
        self.df[score_cols] = self.df[score_cols].round(3)

        return self.df

    def calculate_weighted_score(self, weights_dict):
        """Weighted average over METRIC_MAP columns, ignoring sentinel values.

        Missing metrics simply drop out (the remaining weights renormalize);
        rows with too little valid weight are penalized, and global repetition/
        thinking penalties are applied at the end.
        """
        weighted_sum = pd.Series(0.0, index=self.df.index)
        total_valid_weight = pd.Series(0.0, index=self.df.index)
        total_preset_weight = sum(weights_dict.values())

        for key, weight in weights_dict.items():
            if key not in METRIC_MAP: continue
            norm_col, _ = METRIC_MAP[key]
            if norm_col not in self.df.columns: continue

            values = self.df[norm_col]
            mask = values.notna() & (values.abs() < self.SENTINEL_LIMIT)

            weighted_sum[mask] += values[mask] * weight
            total_valid_weight[mask] += weight

        # Renormalize by the weight that was actually available per row.
        final_score = weighted_sum / total_valid_weight.replace(0, np.nan)
        final_score = final_score.fillna(0.0)

        # Penalize rows where too small a fraction of the preset's weight had data.
        valid_weight_ratio = total_valid_weight / total_preset_weight
        insufficient_mask = valid_weight_ratio < INSUFFICIENT_DATA_THRESHOLD
        final_score[insufficient_mask] *= INSUFFICIENT_DATA_PENALTY

        self._apply_global_penalties(final_score)
        return final_score

    def _calculate_balanced_score(self, metric_keys):
        """Hybrid balance score: sqrt(min) * sqrt(geometric mean)."""
        return self._calculate_special_score(metric_keys, method='hybrid')

    def _calculate_harmonic_score(self, metric_keys):
        """Harmonic mean across metrics (punishes any single weak metric)."""
        return self._calculate_special_score(metric_keys, method='harmonic')

    def _calculate_special_score(self, metric_keys, method):
        """Shared implementation for the balance presets.

        *method* is 'hybrid' or 'harmonic'.  NaNs are softly imputed toward
        (0.3 + column median) / 2, and everything is clipped to [0.1, 1.0]
        before the mean so one missing metric cannot zero a model out.
        """
        cols_to_use = [col for col in metric_keys if col in self.df.columns]
        if not cols_to_use:
            return pd.Series(0.0, index=self.df.index)

        subset = self.df[cols_to_use].copy()

        # Drop sentinel values (treated as missing).
        for col in subset.columns:
            subset[col] = subset[col].where(subset[col].abs() < self.SENTINEL_LIMIT)

        # Soft NaN handling: impute halfway between 0.3 and the column median.
        for col in subset.columns:
            col_median = subset[col].median()
            if pd.isna(col_median) or col_median <= 0:
                fill_val = 0.3
            else:
                fill_val = (0.3 + col_median) / 2
            subset[col] = subset[col].fillna(fill_val)

        # Soft clip: floor at 0.1 so the geometric/harmonic means stay stable.
        subset = subset.clip(lower=0.1, upper=1.0)

        if method == 'hybrid':
            min_score = subset.min(axis=1)
            log_mean = np.log(subset).mean(axis=1)
            geom_score = np.exp(log_mean)
            final_score = np.sqrt(min_score) * np.sqrt(geom_score)
        elif method == 'harmonic':
            n = len(cols_to_use)
            sum_inverse = (1.0 / subset).sum(axis=1)
            final_score = n / sum_inverse
        else:
            # FIX: previously fell through with final_score undefined (NameError).
            raise ValueError(f"Unknown special score method: {method!r}")

        self._apply_global_penalties(final_score)
        return final_score

    def _apply_global_penalties(self, score_series):
        """Multiply *score_series* in place by the global penalty columns (if present)."""
        if 'penalty_repetition' in self.df.columns:
            score_series *= self.df['penalty_repetition'].fillna(1.0)
        if 'penalty_thinking' in self.df.columns:
            score_series *= self.df['penalty_thinking'].fillna(1.0)

    def _generate_badges_vectorized(self):
        """Build the emoji 'Badges' column from flags and parameter counts."""
        badges = pd.Series("", index=self.df.index)
        if 'Is_New' in self.df: badges += np.where(self.df['Is_New'], "๐Ÿ†• ", "")
        if 'Is Thinking Model' in self.df: badges += np.where(self.df['Is Thinking Model'], "๐Ÿง  ", "")
        if 'norm_NSFW' in self.df: badges += np.where((self.df['norm_NSFW'] > 0.5) & (self.df['norm_NSFW'] < self.SENTINEL_LIMIT), "๐Ÿ”ž ", "")
        if 'Repetition Interrupts' in self.df: badges += np.where(self.df['Repetition Interrupts'] >= 1.0, "๐Ÿ“‰ ", "")

        # FIX: DataFrame.get with a scalar default returns the scalar itself,
        # so `self.df.get('Total Parameters', 999).fillna(999)` raised
        # AttributeError whenever the column was missing.  Build a proper
        # Series fallback using the same 999 "unknown size" sentinel.
        if 'Total Parameters' in self.df.columns:
            params = self.df['Total Parameters'].fillna(999)
        else:
            params = pd.Series(999.0, index=self.df.index)
        badges += np.where((params > 0) & (params <= 10), "๐Ÿค ", "")  # pocket-size
        badges += np.where(params >= 70, "๐Ÿณ ", "")  # whale-size (999 sentinel lands here)

        self.df['Badges'] = badges.str.strip()
+ self.df['Badges'] = badges.str.strip()