Sarthak
committed on
Commit
·
7b072e5
1
Parent(s):
aaaf803
chore: remove MTEB results analysis script
Browse files

This commit removes the script for analyzing MTEB results. The script was responsible for processing benchmark results, categorizing tasks, calculating averages, and updating the README.md file with a results table. The script is no longer needed.
- analyze_mteb_results.py +0 -311
analyze_mteb_results.py
DELETED
|
@@ -1,311 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python
|
| 2 |
-
"""
|
| 3 |
-
MTEB Results Analysis Script.
|
| 4 |
-
|
| 5 |
-
This script analyzes MTEB benchmark results from the results directory,
|
| 6 |
-
categorizes tasks, calculates averages, and updates the README.md with
|
| 7 |
-
a comprehensive results table.
|
| 8 |
-
"""
|
| 9 |
-
|
| 10 |
-
import json
|
| 11 |
-
import re
|
| 12 |
-
from pathlib import Path
|
| 13 |
-
|
| 14 |
-
# Task category mappings based on MTEB benchmark structure.
# Kept per-category for readability, then inverted into the flat
# task-name -> category lookup the rest of the script expects.
_CATEGORY_TASKS: dict[str, list[str]] = {
    "Classification": [
        "AmazonCounterfactualClassification",
        "AmazonReviewsClassification",
        "Banking77Classification",
        "EmotionClassification",
        "ImdbClassification",
        "MassiveIntentClassification",
        "MassiveScenarioClassification",
        "MTOPDomainClassification",
        "MTOPIntentClassification",
        "ToxicConversationsClassification",
        "TweetSentimentExtractionClassification",
    ],
    "Clustering": [
        "ArxivClusteringP2P",
        "ArxivClusteringS2S",
        "BiorxivClusteringP2P",
        "BiorxivClusteringS2S",
        "MedrxivClusteringP2P",
        "MedrxivClusteringS2S",
        "RedditClustering",
        "RedditClusteringP2P",
        "StackExchangeClustering",
        "StackExchangeClusteringP2P",
        "TwentyNewsgroupsClustering",
    ],
    "PairClassification": [
        "SprintDuplicateQuestions",
        "TwitterSemEval2015",
        "TwitterURLCorpus",
    ],
    "Reranking": [
        "AskUbuntuDupQuestions",
        "MindSmallReranking",
        "SciDocsRR",
        "StackOverflowDupQuestions",
    ],
    "Retrieval": [
        "ArguAna",
        "ClimateFEVER",
        "CQADupstackRetrieval",
        "DBPedia",
        "FEVER",
        "FiQA2018",
        "HotpotQA",
        "MSMARCO",
        "NFCorpus",
        "NQ",
        "QuoraRetrieval",
        "SCIDOCS",
        "SciFact",
        "Touche2020",
        "TRECCOVID",
    ],
    "CodeRetrieval": [
        "CodeSearchNetCCRetrieval",
        "COIRCodeSearchNetRetrieval",
        "StackOverflowQA",
        "AppsRetrieval",
        "CodeTransOceanContest",
        "CodeTransOceanDL",
        "CodeFeedbackMT",
        "SyntheticText2SQL",
        "CosQA",
    ],
    "STS": [
        "BIOSSES",
        "SICK-R",
        "STS12",
        "STS13",
        "STS14",
        "STS15",
        "STS16",
        "STS17",
        "STS22",
        "STSBenchmark",
        "SummEval",
    ],
}

# Flat lookup used throughout the script: task name -> category name.
TASK_CATEGORIES: dict[str, str] = {
    task: category
    for category, tasks in _CATEGORY_TASKS.items()
    for task in tasks
}
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
def load_mteb_results(results_dir: Path) -> dict[str, dict]:
    """Load all MTEB task results from *results_dir*.

    Each ``*.json`` file (except ``model_meta.json``) is parsed and keyed by
    its ``task_name`` field, falling back to the file stem when the field is
    absent. Corrupt or unreadable files are skipped (best-effort loading).

    Args:
        results_dir: Directory containing one JSON result file per task.

    Returns:
        Mapping of task name to the raw parsed result payload.
    """
    results: dict[str, dict] = {}

    for json_file in results_dir.glob("*.json"):
        # model_meta.json describes the model itself, not a benchmark task.
        if json_file.name == "model_meta.json":
            continue

        try:
            # Keep the try body minimal: only the parse can meaningfully fail.
            # The original caught KeyError here, which `.get` never raises;
            # OSError covers unreadable files in the same best-effort spirit.
            with json_file.open(encoding="utf-8") as f:
                data = json.load(f)
        except (json.JSONDecodeError, OSError):
            continue

        results[data.get("task_name", json_file.stem)] = data

    return results
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
def extract_main_score(result_data: dict) -> float:
    """Return the main score of a task result, or 0.0 when it is missing.

    The score lives at ``scores.test[0].main_score`` in the MTEB result
    payload; any structural deviation is treated as a failed task.
    """
    try:
        return result_data["scores"]["test"][0]["main_score"]
    except (KeyError, IndexError, TypeError):
        # Malformed or incomplete payload -> failed task.
        return 0.0
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
def categorize_tasks(results: dict[str, dict]) -> dict[str, list[tuple[str, float]]]:
    """Group task results by category as ``{category: [(task_name, score), ...]}``.

    Categories come from ``TASK_CATEGORIES``; unknown tasks are inferred from
    naming conventions and otherwise land in "Other". Each category's task
    list is sorted by task name for deterministic output.
    """
    categories: dict[str, list[tuple[str, float]]] = {}

    for task_name, result_data in results.items():
        # Explicit mapping first, name-pattern inference as a fallback.
        category = TASK_CATEGORIES.get(task_name) or _infer_category(task_name)
        score = extract_main_score(result_data)
        categories.setdefault(category, []).append((task_name, score))

    # Deterministic ordering of tasks within each category.
    for category_tasks in categories.values():
        category_tasks.sort(key=lambda pair: pair[0])

    return categories


def _infer_category(task_name: str) -> str:
    """Best-effort category inference from task-name patterns (fallback only).

    Check order matters: e.g. "Classification" wins over any later pattern,
    matching the original chain.
    """
    if "Classification" in task_name:
        return "Classification"
    if "Clustering" in task_name:
        return "Clustering"
    if "Retrieval" in task_name or "QA" in task_name:
        return "Retrieval"
    if "STS" in task_name or "SICK" in task_name or "BIOSSES" in task_name:
        return "STS"
    if "Code" in task_name or "SQL" in task_name:
        return "CodeRetrieval"
    return "Other"
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
def calculate_averages(categories: dict[str, list[tuple[str, float]]]) -> dict[str, float]:
    """Compute the mean score per category, ignoring failed tasks (score == 0).

    A category whose tasks all failed gets an average of 0.0.
    """
    averages: dict[str, float] = {}

    for category, tasks in categories.items():
        # A score of 0 marks a failed task and is excluded from the mean.
        valid_scores = [s for _, s in tasks if s > 0]
        averages[category] = sum(valid_scores) / len(valid_scores) if valid_scores else 0.0

    return averages
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
def generate_results_table(categories: dict[str, list[tuple[str, float]]], averages: dict[str, float]) -> str:
    """Render the categorized results as a markdown section.

    Produces a results table (a bold average row per category followed by one
    row per task, failed tasks marked "Failed"), then summary statistics and
    per-category averages. Returns the whole section as one newline-joined
    string.
    """
    # Overall average over every successful task (score > 0).
    all_scores = [
        score
        for tasks in categories.values()
        for _, score in tasks
        if score > 0
    ]
    overall_avg = sum(all_scores) / len(all_scores) if all_scores else 0.0

    lines = [
        "## MTEB Benchmark Results",
        "",
        f"**Overall Average Score: {overall_avg:.4f}**",
        "",
        "| Category | Task | Score |",
        "|----------|------|-------|",
    ]

    # Fixed ordering keeps the README diff stable across runs.
    category_order = [
        "Classification",
        "Clustering",
        "PairClassification",
        "Reranking",
        "Retrieval",
        "CodeRetrieval",
        "STS",
        "Other",
    ]

    for category in category_order:
        tasks = categories.get(category)
        if not tasks:
            # Absent or empty category: no rows.
            continue

        # Category average first, then one row per task.
        lines.append(f"| **{category}** | **Average** | **{averages[category]:.4f}** |")
        for task_name, score in tasks:
            score_cell = f"{score:.4f}" if score > 0 else "Failed"
            lines.append(f"| | {task_name} | {score_cell} |")

        lines.append("| | | |")  # Empty row for spacing

    total_tasks = sum(len(tasks) for tasks in categories.values())
    lines += [
        "",
        "### Summary Statistics",
        "",
        f"- **Total Tasks**: {total_tasks}",
        f"- **Successful Tasks**: {len(all_scores)}",
        f"- **Failed Tasks**: {total_tasks - len(all_scores)}",
        f"- **Overall Average**: {overall_avg:.4f}",
        "",
        "### Category Averages",
        "",
    ]

    for category in category_order:
        if category in averages and categories.get(category):
            succeeded = len([s for _, s in categories[category] if s > 0])
            lines.append(f"- **{category}**: {averages[category]:.4f} ({succeeded} tasks)")

    return "\n".join(lines)
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
def update_readme(results_table: str, readme_path: Path = Path("README.md")) -> None:
    """Insert or refresh the MTEB results section in *readme_path*.

    Replaces an existing "## MTEB Benchmark Results" section when present;
    otherwise inserts the table just before the Acknowledgments or License
    section, or appends it at the end. Silently no-ops when the README file
    does not exist.
    """
    if not readme_path.exists():
        return

    with readme_path.open(encoding="utf-8") as f:
        content = f.read()

    # Matches the MTEB section up to (not including) the next top-level heading.
    mteb_pattern = r"## MTEB Benchmark Results.*?(?=\n## |\n# |\Z)"

    if re.search(mteb_pattern, content, re.DOTALL):
        # Use a callable replacement so backslashes and "\g<...>" sequences in
        # the table are inserted literally instead of being parsed as regex
        # group references (re.sub replacement-string escape semantics).
        new_content = re.sub(mteb_pattern, lambda _m: results_table, content, flags=re.DOTALL)
    elif "## Acknowledgments" in content:
        new_content = content.replace("## Acknowledgments", f"{results_table}\n\n## Acknowledgments")
    elif "## License" in content:
        new_content = content.replace("## License", f"{results_table}\n\n## License")
    else:
        # No known anchor section: append at the end.
        new_content = f"{content}\n\n{results_table}"

    with readme_path.open("w", encoding="utf-8") as f:
        f.write(new_content)
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
def main() -> None:
    """Analyze MTEB results and update README.md with a results table.

    Loads results from a hard-coded results directory, categorizes and
    averages them, renders the markdown section, and writes it into
    README.md. Exits quietly when there is nothing to analyze.
    """
    results_dir = Path("mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled")

    # Nothing to do until benchmark results have been generated.
    if not results_dir.exists():
        return

    results = load_mteb_results(results_dir)
    if not results:
        return

    categories = categorize_tasks(results)
    averages = calculate_averages(categories)
    results_table = generate_results_table(categories, averages)
    update_readme(results_table)

    # NOTE(review): the original recomputed totals/averages here and discarded
    # every value (leftovers of removed print statements); that dead code is
    # deleted — behavior is unchanged since nothing was printed or returned.
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
# Script entry point: run the analysis only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|