Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import numpy as np | |
| import shutil | |
| import json | |
| import math | |
| from pathlib import Path | |
# Optional dependency: AutoGluon is imported at module load when it is
# installed; AUTOGLUON_AVAILABLE records whether that import succeeded.
try:
    from autogluon.tabular import TabularPredictor
except ImportError:
    AUTOGLUON_AVAILABLE = False
else:
    AUTOGLUON_AVAILABLE = True

# CSV read as the AI side of the comparison.
DATA_AI = Path("data") / "dataset.csv"
# CSV read as the manual side of the comparison.
DATA_MANUAL = Path("data") / "manual_dataset.csv"
def sanitize_for_json(obj):
    """Recursively replace values that strict JSON cannot represent.

    NaN and +/-infinity floats become ``None``. NumPy scalar floats, integers
    and booleans are converted to the matching built-in type, because the
    standard ``json`` encoder rejects e.g. ``np.int64`` and ``np.float32``.
    Dicts, lists and tuples are walked recursively; dict keys are left as-is.

    Args:
        obj: Any value, typically a nested structure of dicts and lists.

    Returns:
        A sanitized copy for dicts, lists and tuples (tuples come back as
        plain tuples), a built-in ``float``/``int``/``bool``/``None`` for
        numeric scalars, and the unchanged object for everything else.
    """
    # np.bool_ is not a Python bool and np.integer is not a Python int, so
    # neither would survive json.dumps() untouched.
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, (float, np.floating)):
        # np.float64 subclasses float; np.float32 and friends do not.
        value = float(obj)
        return value if math.isfinite(value) else None
    if isinstance(obj, dict):
        return {k: sanitize_for_json(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [sanitize_for_json(v) for v in obj]
    if isinstance(obj, tuple):
        return tuple(sanitize_for_json(v) for v in obj)
    return obj
def calculate_tag_accuracy(tags_ai, tags_man):
    """Score the overlap between two comma-separated tag strings.

    Tags are compared case-insensitively after trimming whitespace; empty
    entries are ignored and a missing value (as judged by ``pd.isna``) is
    treated as an empty tag list.

    Returns:
        1.0 when both sides have no tags, 0.0 when exactly one side has no
        tags, otherwise the Jaccard similarity |A & B| / |A | B|.
    """
    def to_tag_set(raw):
        # Missing cells behave like an empty string of tags.
        text = "" if pd.isna(raw) else str(raw)
        trimmed = (piece.strip() for piece in text.split(","))
        return {piece.lower() for piece in trimmed if piece}

    predicted = to_tag_set(tags_ai)
    reference = to_tag_set(tags_man)

    if not (predicted or reference):
        return 1.0
    if not (predicted and reference):
        return 0.0
    return len(predicted & reference) / len(predicted | reference)
def get_combined_dataset():
    """Join AI predictions with manual ground truth and compute error metrics.

    Both CSVs are inner-joined on a whitespace-trimmed, string-typed ``id``.
    Columns that exist in both files receive the ``_ai`` / ``_manual``
    suffixes. The following columns are added to the merged frame:

    * ``abs_error`` - absolute difference of the two final veracity scores
      (non-numeric or missing scores are counted as 0).
    * ``err_<ai column>`` - absolute difference for every sub-score pair that
      is present on both sides, after both columns are scaled by 10 in place
      (non-numeric or missing sub-scores are counted as 5 before scaling).
    * ``composite_mae`` - row-wise mean of ``abs_error`` and whichever
      ``err_*`` columns were computed.
    * ``tag_accuracy`` - result of ``calculate_tag_accuracy`` per row.

    Returns:
        The merged ``DataFrame``, or ``None`` when either CSV file does not
        exist or when loading/merging raises (the error is printed).
    """
    if not DATA_AI.exists() or not DATA_MANUAL.exists():
        return None

    try:
        # Load datasets
        df_ai = pd.read_csv(DATA_AI)
        df_manual = pd.read_csv(DATA_MANUAL)

        # Normalize IDs (trim spaces, ensure string) so the join keys compare equal.
        df_ai['id'] = df_ai['id'].astype(str).str.strip()
        df_manual['id'] = df_manual['id'].astype(str).str.strip()

        # The merge only suffixes columns present in BOTH frames, so remember
        # which side owns 'tags' in case it ends up unsuffixed.
        ai_has_tags = 'tags' in df_ai.columns
        manual_has_tags = 'tags' in df_manual.columns

        df_manual_cols = [
            'id', 'final_veracity_score', 'visual_integrity_score',
            'audio_integrity_score', 'source_credibility_score',
            'logical_consistency_score', 'emotional_manipulation_score',
            'video_audio_score', 'video_caption_score', 'audio_caption_score',
            'tags', 'classification',
        ]

        # Merge on ID
        merged = pd.merge(
            df_ai,
            df_manual[[c for c in df_manual_cols if c in df_manual.columns]],
            on='id',
            suffixes=('_ai', '_manual'),
            how='inner',
        )

        # 1. Final Score Error
        merged['final_veracity_score_ai'] = pd.to_numeric(merged['final_veracity_score_ai'], errors='coerce').fillna(0)
        merged['final_veracity_score_manual'] = pd.to_numeric(merged['final_veracity_score_manual'], errors='coerce').fillna(0)
        merged['abs_error'] = (merged['final_veracity_score_ai'] - merged['final_veracity_score_manual']).abs()

        # 2. Per-dimension errors: (AI column, manual column) pairs.
        vector_pairs = [
            ('visual_score', 'visual_integrity_score'),
            ('audio_score', 'audio_integrity_score'),
            ('source_score', 'source_credibility_score'),
            ('logic_score', 'logical_consistency_score'),
            ('emotion_score', 'emotional_manipulation_score'),
            ('align_video_audio', 'video_audio_score'),
            ('align_video_caption', 'video_caption_score'),
            ('align_audio_caption', 'audio_caption_score'),
        ]

        error_cols = ['abs_error']
        for ai_c, man_c in vector_pairs:
            if ai_c in merged.columns and man_c in merged.columns:
                # Multiply 1-10 scores by 10 to put them on the same 0-100 scale as final score
                merged[ai_c] = pd.to_numeric(merged[ai_c], errors='coerce').fillna(5) * 10
                merged[man_c] = pd.to_numeric(merged[man_c], errors='coerce').fillna(5) * 10
                err_c = f"err_{ai_c}"
                merged[err_c] = (merged[ai_c] - merged[man_c]).abs()
                error_cols.append(err_c)

        # Composite MAE: mean absolute error across the final score and every
        # sub-score pair that was available on both sides.
        merged['composite_mae'] = merged[error_cols].mean(axis=1)

        # 3. Tag Accuracy Calculation
        no_tags = pd.Series("", index=merged.index)

        def tag_column(suffixed, side_has_tags):
            # Prefer the suffixed column; fall back to the unsuffixed 'tags'
            # column only for the side that actually supplied it.
            if suffixed in merged.columns:
                return merged[suffixed]
            if side_has_tags and 'tags' in merged.columns:
                return merged['tags']
            return no_tags

        merged['tag_accuracy'] = [
            calculate_tag_accuracy(ai_tags, manual_tags)
            for ai_tags, manual_tags in zip(
                tag_column('tags_ai', ai_has_tags),
                tag_column('tags_manual', manual_has_tags),
            )
        ]

        return merged
    except Exception as e:
        # Deliberate best-effort boundary: callers treat None as "no data".
        print(f"Error merging datasets: {e}")
        return None
def format_config_params(params_raw):
    """Render a run's ``config_params`` value as a short leaderboard label.

    Args:
        params_raw: A JSON object string, an already-parsed dict, or a
            missing/empty value (``None``, NaN, ``""``, ``{}``, ...).

    Returns:
        ``"Defaults"`` when the value is missing or falsy,
        ``"Retries:<reprompts> | Context:<Yes|No>"`` when it is (or parses
        to) a dict, and ``"Legacy/Unknown"`` when it cannot be interpreted.
    """
    # pd.isna() returns an array for list-likes, whose truth value can be
    # ambiguous, so only ask it about scalars.
    if pd.api.types.is_scalar(params_raw) and pd.isna(params_raw):
        return "Defaults"
    if not params_raw:
        return "Defaults"

    try:
        p = json.loads(params_raw) if isinstance(params_raw, str) else params_raw
        reprompts = p.get('reprompts', 0)
        include_comments = p.get('include_comments')
    except (ValueError, TypeError, AttributeError):
        # Invalid JSON, or a value that is not dict-like (no usable .get()).
        return "Legacy/Unknown"

    # The flag counts as set only for the string 'true' or the boolean True.
    comments = "Yes" if include_comments == 'true' or include_comments is True else "No"
    return f"Retries:{reprompts} | Context:{comments}"
def calculate_benchmarks():
    """Summarise every merged AI row against the manual ground truth.

    Returns:
        ``{"status": "no_data"}`` when the combined dataset is unavailable or
        empty. Otherwise a JSON-safe dict with ``count``, ``mae`` (mean of
        ``composite_mae``), ``accuracy_percent`` (share of rows whose AI and
        manual final scores fall on the same side of 50),
        ``tag_accuracy_percent`` and ``recent_samples`` (the last five rows).
    """
    combined = get_combined_dataset()
    if combined is None or len(combined) == 0:
        return {"status": "no_data"}

    # Binary Accuracy (Threshold 50): compare which side of 50 each score is on.
    combined['bin_ai'] = combined['final_veracity_score_ai'] >= 50
    combined['bin_manual'] = combined['final_veracity_score_manual'] >= 50
    agreement = (combined['bin_ai'] == combined['bin_manual']).mean()

    sample_columns = [
        'id',
        'composite_mae',
        'final_veracity_score_ai',
        'final_veracity_score_manual',
    ]
    latest_rows = combined.tail(5)[sample_columns].to_dict(orient='records')

    summary = {
        "count": int(len(combined)),
        # Composite MAE is exposed as the main MAE metric.
        "mae": round(combined['composite_mae'].mean(), 2),
        "accuracy_percent": round(agreement * 100, 1),
        "tag_accuracy_percent": round(combined['tag_accuracy'].mean() * 100, 1),
        "recent_samples": latest_rows,
    }
    return sanitize_for_json(summary)
def generate_leaderboard():
    """Rank run configurations by how well they match the manual ground truth.

    Rows of the combined dataset are grouped by model, prompt, reasoning
    mode, readable parameters, enabled tools and reasoning depth. Each group
    yields one entry holding the mean composite MAE, mean tag accuracy,
    binary accuracy (AI and manual final scores on the same side of 50), the
    sample count and the mean of every ``err_*`` column that is available.

    Returns:
        A JSON-safe list of dicts sorted by highest accuracy, then highest
        tag accuracy, then lowest composite MAE. An empty list is returned
        when the combined dataset is unavailable or empty.
    """
    merged = get_combined_dataset()
    if merged is None or len(merged) == 0:
        return []

    # Make sure every grouping column exists and has no gaps.
    for col in ['config_model', 'config_prompt', 'config_reasoning', 'config_params']:
        if col not in merged.columns:
            merged[col] = "Unknown"

    merged = merged.fillna({'config_model': 'Unknown', 'config_prompt': 'Standard', 'config_reasoning': 'None'})
    merged['params_readable'] = merged['config_params'].apply(format_config_params)

    def extract_tools(p_raw):
        """Return a comma-separated list of the tool flags set in the params."""
        try:
            p = json.loads(p_raw) if isinstance(p_raw, str) else p_raw
            if not isinstance(p, dict):
                return "None"
            tools = []
            if p.get('agent_active'):
                tools.append("Agent")
            if p.get('use_search'):
                tools.append("Search")
            if p.get('use_code'):
                tools.append("Code")
            if p.get('few_shot') or p.get('multi_shot'):
                tools.append("Few-Shot")
            return ", ".join(tools) if tools else "None"
        except (ValueError, TypeError):
            # Unparseable JSON (e.g. the "Unknown" placeholder) or odd values.
            return "None"

    def get_fcot_depth(reasoning):
        """Map the reasoning label to 2 (contains 'fcot'), 1 ('cot') or 0."""
        label = str(reasoning).lower()
        if 'fcot' in label:
            return 2
        if 'cot' in label:
            return 1
        return 0

    merged['tools'] = merged['config_params'].apply(extract_tools)

    merged['bin_ai'] = merged['final_veracity_score_ai'] >= 50
    merged['bin_manual'] = merged['final_veracity_score_manual'] >= 50
    merged['is_correct'] = (merged['bin_ai'] == merged['bin_manual']).astype(int)

    merged['fcot_depth'] = merged['config_reasoning'].map(get_fcot_depth)

    agg_dict = {
        'comp_mae': ('composite_mae', 'mean'),
        'tag_accuracy': ('tag_accuracy', 'mean'),
        'accuracy': ('is_correct', 'mean'),
        'count': ('id', 'count'),
    }

    err_cols = [
        'err_visual_score', 'err_audio_score', 'err_source_score',
        'err_logic_score', 'err_emotion_score', 'err_align_video_audio',
        'err_align_video_caption', 'err_align_audio_caption',
    ]
    # Only aggregate the per-dimension errors that the dataset actually has.
    available_err_cols = [col for col in err_cols if col in merged.columns]
    for col in available_err_cols:
        agg_dict[col] = (col, 'mean')

    # Group By Configuration using Composite MAE and Tag Accuracy
    group_keys = ['config_model', 'config_prompt', 'config_reasoning', 'params_readable', 'tools', 'fcot_depth']
    grouped = merged.groupby(group_keys).agg(**agg_dict).reset_index()

    leaderboard = []
    for _, row in grouped.iterrows():
        entry = {
            "type": "GenAI",
            "model": row['config_model'],
            "prompt": row['config_prompt'],
            "reasoning": row['config_reasoning'],
            "params": row['params_readable'],
            "tools": row['tools'],
            "fcot_depth": int(row['fcot_depth']),
            "comp_mae": round(row['comp_mae'], 2),
            "tag_acc": round(row['tag_accuracy'] * 100, 1),
            "accuracy": round(row['accuracy'] * 100, 1),
            "samples": int(row['count']),
        }
        for col in available_err_cols:
            entry[col] = round(row[col], 2)
        leaderboard.append(entry)

    # Sort: Highest Accuracy, Highest Tag Accuracy, then Lowest Composite MAE
    leaderboard.sort(key=lambda x: (-x['accuracy'], -x['tag_acc'], x['comp_mae']))

    return sanitize_for_json(leaderboard)