Sarthak committed on
Commit
7b072e5
·
1 Parent(s): aaaf803

chore: remove MTEB results analysis script

Browse files

This commit removes the script for analyzing MTEB results. The script was responsible for processing benchmark results, categorizing tasks, calculating averages, and updating the README.md file with a results table. The script is no longer needed.

Files changed (1) hide show
  1. analyze_mteb_results.py +0 -311
analyze_mteb_results.py DELETED
@@ -1,311 +0,0 @@
1
- #!/usr/bin/env python
2
- """
3
- MTEB Results Analysis Script.
4
-
5
- This script analyzes MTEB benchmark results from the results directory,
6
- categorizes tasks, calculates averages, and updates the README.md with
7
- a comprehensive results table.
8
- """
9
-
10
- import json
11
- import re
12
- from pathlib import Path
13
-
14
# Task category mappings based on MTEB benchmark structure.
# Tasks are listed once per category and flattened below into the
# task-name -> category lookup that the rest of the script uses.
_TASKS_BY_CATEGORY: dict[str, tuple[str, ...]] = {
    "Classification": (
        "AmazonCounterfactualClassification",
        "AmazonReviewsClassification",
        "Banking77Classification",
        "EmotionClassification",
        "ImdbClassification",
        "MassiveIntentClassification",
        "MassiveScenarioClassification",
        "MTOPDomainClassification",
        "MTOPIntentClassification",
        "ToxicConversationsClassification",
        "TweetSentimentExtractionClassification",
    ),
    "Clustering": (
        "ArxivClusteringP2P",
        "ArxivClusteringS2S",
        "BiorxivClusteringP2P",
        "BiorxivClusteringS2S",
        "MedrxivClusteringP2P",
        "MedrxivClusteringS2S",
        "RedditClustering",
        "RedditClusteringP2P",
        "StackExchangeClustering",
        "StackExchangeClusteringP2P",
        "TwentyNewsgroupsClustering",
    ),
    "PairClassification": (
        "SprintDuplicateQuestions",
        "TwitterSemEval2015",
        "TwitterURLCorpus",
    ),
    "Reranking": (
        "AskUbuntuDupQuestions",
        "MindSmallReranking",
        "SciDocsRR",
        "StackOverflowDupQuestions",
    ),
    "Retrieval": (
        "ArguAna",
        "ClimateFEVER",
        "CQADupstackRetrieval",
        "DBPedia",
        "FEVER",
        "FiQA2018",
        "HotpotQA",
        "MSMARCO",
        "NFCorpus",
        "NQ",
        "QuoraRetrieval",
        "SCIDOCS",
        "SciFact",
        "Touche2020",
        "TRECCOVID",
    ),
    "CodeRetrieval": (
        "CodeSearchNetCCRetrieval",
        "COIRCodeSearchNetRetrieval",
        "StackOverflowQA",
        "AppsRetrieval",
        "CodeTransOceanContest",
        "CodeTransOceanDL",
        "CodeFeedbackMT",
        "SyntheticText2SQL",
        "CosQA",
    ),
    "STS": (
        "BIOSSES",
        "SICK-R",
        "STS12",
        "STS13",
        "STS14",
        "STS15",
        "STS16",
        "STS17",
        "STS22",
        "STSBenchmark",
        "SummEval",
    ),
}

# Flat task -> category lookup; equivalent to the original hand-written dict.
TASK_CATEGORIES: dict[str, str] = {
    task: category
    for category, tasks in _TASKS_BY_CATEGORY.items()
    for task in tasks
}
88
-
89
-
90
def load_mteb_results(results_dir: Path) -> dict[str, dict]:
    """Load all MTEB task results from *results_dir*.

    Every ``*.json`` file except ``model_meta.json`` is treated as one task
    result. Files that cannot be read or parsed are skipped (best-effort),
    so a single corrupt file does not abort the whole analysis.

    Args:
        results_dir: Directory containing per-task MTEB JSON result files.

    Returns:
        Mapping of task name (the ``task_name`` field, falling back to the
        file stem) to the parsed JSON payload.
    """
    results: dict[str, dict] = {}

    for json_file in results_dir.glob("*.json"):
        # model_meta.json describes the model, not a task result.
        if json_file.name == "model_meta.json":
            continue

        try:
            with json_file.open() as f:
                data = json.load(f)
        # The original caught KeyError here, but dict.get cannot raise it;
        # the real failure modes are unreadable files (OSError) and
        # malformed JSON, so catch those instead (still best-effort skip).
        except (OSError, json.JSONDecodeError):
            continue

        results[data.get("task_name", json_file.stem)] = data

    return results
107
-
108
-
109
def extract_main_score(result_data: dict) -> float:
    """Return the ``main_score`` of the first test split, or 0.0 if absent.

    Callers treat 0.0 as "task failed", so any structural problem in
    *result_data* (missing keys, empty split list, non-dict values) maps
    to 0.0 instead of raising.
    """
    try:
        main_score = result_data["scores"]["test"][0]["main_score"]
    except (KeyError, IndexError, TypeError):
        main_score = 0.0
    return main_score
116
-
117
-
118
def categorize_tasks(
    results: dict[str, dict],
    task_categories: "dict[str, str] | None" = None,
) -> dict[str, list[tuple[str, float]]]:
    """Group task results by category and pair each task with its score.

    Args:
        results: Mapping of task name to raw MTEB result payload.
        task_categories: Optional explicit task -> category mapping;
            defaults to the module-level ``TASK_CATEGORIES``. Tasks absent
            from the mapping fall back to name-pattern inference, then to
            ``"Other"``.

    Returns:
        Mapping of category name to a list of ``(task_name, main_score)``
        tuples, sorted by task name within each category.
    """
    known = TASK_CATEGORIES if task_categories is None else task_categories
    categories: dict[str, list[tuple[str, float]]] = {}

    for task_name, result_data in results.items():
        # Explicit mapping wins; otherwise infer from the task name.
        category = known.get(task_name) or _infer_category(task_name)
        categories.setdefault(category, []).append(
            (task_name, extract_main_score(result_data))
        )

    # Sort tasks within each category for a deterministic README table.
    for category_tasks in categories.values():
        category_tasks.sort(key=lambda pair: pair[0])

    return categories


def _infer_category(task_name: str) -> str:
    """Best-effort category inference from task-name patterns."""
    # Order matters: e.g. "...Classification" must win before other checks,
    # and "QA" is checked before "Code" (matching the original chain).
    if "Classification" in task_name:
        return "Classification"
    if "Clustering" in task_name:
        return "Clustering"
    if "Retrieval" in task_name or "QA" in task_name:
        return "Retrieval"
    if "STS" in task_name or "SICK" in task_name or "BIOSSES" in task_name:
        return "STS"
    if "Code" in task_name or "SQL" in task_name:
        return "CodeRetrieval"
    return "Other"
152
-
153
-
154
def calculate_averages(categories: dict[str, list[tuple[str, float]]]) -> dict[str, float]:
    """Compute the mean positive score for each category.

    Scores of 0 (or below) mark failed tasks and are excluded from the
    mean; a category with no successful task averages to 0.0.
    """
    averages: dict[str, float] = {}
    for category, task_scores in categories.items():
        passing = [score for _, score in task_scores if score > 0]
        averages[category] = sum(passing) / len(passing) if passing else 0.0
    return averages
166
-
167
-
168
def generate_results_table(categories: dict[str, list[tuple[str, float]]], averages: dict[str, float]) -> str:
    """Render the MTEB results as a markdown section.

    Produces a header with the overall average, a table grouping tasks
    under their category average (failed tasks shown as "Failed"), and a
    summary-statistics section. Returns the whole block as one string.
    """
    # Overall average over every successful (score > 0) task.
    successful_scores = [
        score
        for task_scores in categories.values()
        for _, score in task_scores
        if score > 0
    ]
    overall_avg = (
        sum(successful_scores) / len(successful_scores) if successful_scores else 0.0
    )

    lines = [
        "## MTEB Benchmark Results",
        "",
        f"**Overall Average Score: {overall_avg:.4f}**",
        "",
        "| Category | Task | Score |",
        "|----------|------|-------|",
    ]

    # Fixed ordering keeps README diffs stable between runs.
    ordered_categories = (
        "Classification",
        "Clustering",
        "PairClassification",
        "Reranking",
        "Retrieval",
        "CodeRetrieval",
        "STS",
        "Other",
    )

    for category in ordered_categories:
        tasks = categories.get(category)
        if not tasks:
            continue

        lines.append(f"| **{category}** | **Average** | **{averages[category]:.4f}** |")
        for task_name, score in tasks:
            # A non-positive score means the task produced no usable result.
            cell = f"{score:.4f}" if score > 0 else "Failed"
            lines.append(f"| | {task_name} | {cell} |")
        lines.append("| | | |")  # Empty row for spacing

    total_tasks = sum(len(task_scores) for task_scores in categories.values())
    lines += [
        "",
        "### Summary Statistics",
        "",
        f"- **Total Tasks**: {total_tasks}",
        f"- **Successful Tasks**: {len(successful_scores)}",
        f"- **Failed Tasks**: {total_tasks - len(successful_scores)}",
        f"- **Overall Average**: {overall_avg:.4f}",
        "",
        "### Category Averages",
        "",
    ]

    for category in ordered_categories:
        if category in averages and categories.get(category):
            passed = len([s for _, s in categories[category] if s > 0])
            lines.append(f"- **{category}**: {averages[category]:.4f} ({passed} tasks)")

    return "\n".join(lines)
243
-
244
-
245
def update_readme(results_table: str, readme_path: Path = Path("README.md")) -> None:
    """Insert or refresh the MTEB results section in the README.

    If a "## MTEB Benchmark Results" section already exists it is replaced;
    otherwise the table is inserted before the Acknowledgments or License
    section, falling back to appending at the end of the file. A missing
    README is silently ignored.

    Args:
        results_table: Markdown block (e.g. from generate_results_table()).
        readme_path: README file to update; defaults to ./README.md.
    """
    if not readme_path.exists():
        return

    content = readme_path.read_text()

    # Match an existing results section up to the next top-level heading.
    mteb_pattern = r"## MTEB Benchmark Results.*?(?=\n## |\n# |\Z)"

    if re.search(mteb_pattern, content, re.DOTALL):
        # Replace the existing section. Use a callable replacement so that
        # backslashes or "\1"-style sequences in the table are inserted
        # literally instead of being interpreted as group references by
        # re.sub (the original passed the table as a plain repl string,
        # which corrupts or raises on such content).
        new_content = re.sub(
            mteb_pattern, lambda _match: results_table, content, flags=re.DOTALL
        )
    elif "## Acknowledgments" in content:
        new_content = content.replace("## Acknowledgments", f"{results_table}\n\n## Acknowledgments")
    elif "## License" in content:
        new_content = content.replace("## License", f"{results_table}\n\n## License")
    else:
        # No known anchor section: append at the end.
        new_content = f"{content}\n\n{results_table}"

    readme_path.write_text(new_content)
272
-
273
-
274
def main() -> None:
    """Analyze MTEB results and update the README with a results table.

    Exits quietly when the results directory is missing or contains no
    parseable results.
    """
    # NOTE(review): the results path is hard-coded to one distilled model
    # run; parameterize if other models need analyzing.
    results_dir = Path("mteb_results/gte-Qwen2-7B-instruct-M2V-Distilled/distilled")

    if not results_dir.exists():
        return

    results = load_mteb_results(results_dir)
    if not results:
        return

    categories = categorize_tasks(results)
    averages = calculate_averages(categories)
    results_table = generate_results_table(categories, averages)
    update_readme(results_table)

    # The original recomputed totals and averages here and discarded every
    # value (bare `sum(...)`, `len(...)`, `averages.get(...)` expressions —
    # leftovers of removed print statements). Those no-op statements had no
    # side effects and are removed.
308
-
309
-
310
- if __name__ == "__main__":
311
- main()