import pandas as pd
import numpy as np
import shutil
import json
import math
from pathlib import Path
# Lazy import to avoid startup overhead
try:
from autogluon.tabular import TabularPredictor
AUTOGLUON_AVAILABLE = True
except ImportError:
AUTOGLUON_AVAILABLE = False
DATA_AI = Path("data/dataset.csv")
DATA_MANUAL = Path("data/manual_dataset.csv")
def sanitize_for_json(obj):
"""Recursively clean floats for JSON output."""
if isinstance(obj, float):
if math.isnan(obj) or math.isinf(obj): return None
return obj
elif isinstance(obj, dict):
return {k: sanitize_for_json(v) for k, v in obj.items()}
elif isinstance(obj, list):
return[sanitize_for_json(v) for v in obj]
return obj
def calculate_tag_accuracy(tags_ai, tags_man):
if pd.isna(tags_ai): tags_ai = ""
if pd.isna(tags_man): tags_man = ""
set_ai = set([t.strip().lower() for t in str(tags_ai).split(',') if t.strip()])
set_man = set([t.strip().lower() for t in str(tags_man).split(',') if t.strip()])
if not set_man and not set_ai: return 1.0
if not set_man or not set_ai: return 0.0
# Jaccard Similarity
return len(set_ai.intersection(set_man)) / len(set_ai.union(set_man))
def get_combined_dataset():
"""
Joins AI predictions with Manual Ground Truth on ID and calculates comprehensive vector differences.
"""
if not DATA_AI.exists() or not DATA_MANUAL.exists():
return None
try:
# Load datasets
df_ai = pd.read_csv(DATA_AI)
df_manual = pd.read_csv(DATA_MANUAL)
# Normalize IDs (Trim spaces, ensure string)
df_ai['id'] = df_ai['id'].astype(str).str.strip()
df_manual['id'] = df_manual['id'].astype(str).str.strip()
df_manual_cols =['id', 'final_veracity_score', 'visual_integrity_score', 'audio_integrity_score', 'source_credibility_score', 'logical_consistency_score', 'emotional_manipulation_score', 'video_audio_score', 'video_caption_score', 'audio_caption_score', 'tags', 'classification']
# Merge on ID
merged = pd.merge(
df_ai,
df_manual[[c for c in df_manual_cols if c in df_manual.columns]],
on='id',
suffixes=('_ai', '_manual'),
how='inner'
)
# 1. Final Score Error
merged['final_veracity_score_ai'] = pd.to_numeric(merged['final_veracity_score_ai'], errors='coerce').fillna(0)
merged['final_veracity_score_manual'] = pd.to_numeric(merged['final_veracity_score_manual'], errors='coerce').fillna(0)
merged['abs_error'] = (merged['final_veracity_score_ai'] - merged['final_veracity_score_manual']).abs()
# 2. Sophisticated Vector Calculations
vector_pairs =[
('visual_score', 'visual_integrity_score'),
('audio_score', 'audio_integrity_score'),
('source_score', 'source_credibility_score'),
('logic_score', 'logical_consistency_score'),
('emotion_score', 'emotional_manipulation_score'),
('align_video_audio', 'video_audio_score'),
('align_video_caption', 'video_caption_score'),
('align_audio_caption', 'audio_caption_score'),
]
error_cols = ['abs_error']
for ai_c, man_c in vector_pairs:
if ai_c in merged.columns and man_c in merged.columns:
# Multiply 1-10 scores by 10 to put them on the same 0-100 scale as final score
merged[ai_c] = pd.to_numeric(merged[ai_c], errors='coerce').fillna(5) * 10
merged[man_c] = pd.to_numeric(merged[man_c], errors='coerce').fillna(5) * 10
err_c = f"err_{ai_c}"
merged[err_c] = (merged[ai_c] - merged[man_c]).abs()
error_cols.append(err_c)
# Composite MAE represents the mean absolute error across the final score AND all 8 sub-vectors
merged['composite_mae'] = merged[error_cols].mean(axis=1)
# 3. Tag Accuracy Calculation
merged['tag_accuracy'] = merged.apply(lambda row: calculate_tag_accuracy(row.get('tags_ai', ''), row.get('tags_manual', '')), axis=1)
return merged
except Exception as e:
print(f"Error merging datasets: {e}")
return None
def format_config_params(params_raw):
"""Parses the config_params JSON string into a readable format for the leaderboard."""
if pd.isna(params_raw) or not params_raw:
return "Defaults"
try:
if isinstance(params_raw, str):
p = json.loads(params_raw)
else:
p = params_raw
reprompts = p.get('reprompts', 0)
comments = "Yes" if p.get('include_comments') == 'true' or p.get('include_comments') is True else "No"
return f"Retries:{reprompts} | Context:{comments}"
except:
return "Legacy/Unknown"
def calculate_benchmarks():
"""Global stats (All AI models vs Ground Truth)."""
merged = get_combined_dataset()
if merged is None or len(merged) == 0:
return {"status": "no_data"}
mae = merged['composite_mae'].mean()
tag_acc = merged['tag_accuracy'].mean()
# Binary Accuracy (Threshold 50)
merged['bin_ai'] = merged['final_veracity_score_ai'] >= 50
merged['bin_manual'] = merged['final_veracity_score_manual'] >= 50
accuracy = (merged['bin_ai'] == merged['bin_manual']).mean()
recent_samples = merged.tail(5)[['id', 'composite_mae', 'final_veracity_score_ai', 'final_veracity_score_manual']].to_dict(orient='records')
result = {
"count": int(len(merged)),
"mae": round(mae, 2), # Exposing composite MAE as main MAE metric
"accuracy_percent": round(accuracy * 100, 1),
"tag_accuracy_percent": round(tag_acc * 100, 1),
"recent_samples": recent_samples
}
return sanitize_for_json(result)
def generate_leaderboard():
"""
Groups results by Configuration to rank models/prompts using sophisticated distance measurements.
"""
merged = get_combined_dataset()
if merged is None or len(merged) == 0:
return []
for col in['config_model', 'config_prompt', 'config_reasoning', 'config_params']:
if col not in merged.columns: merged[col] = "Unknown"
merged = merged.fillna({'config_model': 'Unknown', 'config_prompt': 'Standard', 'config_reasoning': 'None'})
merged['params_readable'] = merged['config_params'].apply(format_config_params)
def extract_tools(p_raw):
try:
if isinstance(p_raw, str): p = json.loads(p_raw)
else: p = p_raw
if not isinstance(p, dict): return "None"
tools =[]
if p.get('agent_active'): tools.append("Agent")
if p.get('use_search'): tools.append("Search")
if p.get('use_code'): tools.append("Code")
if p.get('few_shot') or p.get('multi_shot'): tools.append("Few-Shot")
return ", ".join(tools) if tools else "None"
except:
return "None"
merged['tools'] = merged['config_params'].apply(extract_tools)
merged['bin_ai'] = merged['final_veracity_score_ai'] >= 50
merged['bin_manual'] = merged['final_veracity_score_manual'] >= 50
merged['is_correct'] = (merged['bin_ai'] == merged['bin_manual']).astype(int)
def get_fcot_depth(row):
r = str(row['config_reasoning']).lower()
if 'fcot' in r: return 2
elif 'cot' in r: return 1
return 0
merged['fcot_depth'] = merged.apply(get_fcot_depth, axis=1)
agg_dict = {
'comp_mae': ('composite_mae', 'mean'),
'tag_accuracy': ('tag_accuracy', 'mean'),
'accuracy': ('is_correct', 'mean'),
'count': ('id', 'count')
}
err_cols =[
'err_visual_score', 'err_audio_score', 'err_source_score',
'err_logic_score', 'err_emotion_score', 'err_align_video_audio',
'err_align_video_caption', 'err_align_audio_caption'
]
for col in err_cols:
if col in merged.columns:
agg_dict[col] = (col, 'mean')
# Group By Configuration using Composite MAE and Tag Accuracy
grouped = merged.groupby(['config_model', 'config_prompt', 'config_reasoning', 'params_readable', 'tools', 'fcot_depth']).agg(**agg_dict).reset_index()
leaderboard =[]
for _, row in grouped.iterrows():
entry = {
"type": "GenAI",
"model": row['config_model'],
"prompt": row['config_prompt'],
"reasoning": row['config_reasoning'],
"params": row['params_readable'],
"tools": row['tools'],
"fcot_depth": int(row['fcot_depth']),
"comp_mae": round(row['comp_mae'], 2),
"tag_acc": round(row['tag_accuracy'] * 100, 1),
"accuracy": round(row['accuracy'] * 100, 1),
"samples": int(row['count'])
}
for col in err_cols:
if col in row:
entry[col] = round(row[col], 2)
leaderboard.append(entry)
# Sort: Highest Accuracy, Highest Tag Accuracy, then Lowest Composite MAE
leaderboard.sort(key=lambda x: (-x['accuracy'], -x['tag_acc'], x['comp_mae']))
return sanitize_for_json(leaderboard)
|