Specific-Cognito committed on
Commit 0079cfb · verified
1 Parent(s): cc8ba82

Create evaluate_embeddings.py

Files changed (1)
  1. evaluate_embeddings.py +323 -0
evaluate_embeddings.py ADDED
@@ -0,0 +1,323 @@
"""
Helion-V1-Embeddings Evaluation Script
Evaluate embedding model quality on standard benchmarks
"""

import json
import logging
import numpy as np
from typing import List, Dict, Tuple
from dataclasses import dataclass, asdict
from pathlib import Path

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


@dataclass
class EvaluationMetrics:
    """Container for evaluation metrics."""
    sts_correlation: float = 0.0
    retrieval_accuracy: float = 0.0
    clustering_score: float = 0.0
    speed_sentences_per_sec: float = 0.0
    model_size_mb: float = 0.0

    def to_dict(self):
        return asdict(self)


class EmbeddingsEvaluator:
    """Evaluate embeddings model."""

    def __init__(self, model_name: str = "DeepXR/Helion-V1-embeddings"):
        from sentence_transformers import SentenceTransformer

        logger.info(f"Loading model: {model_name}")
        self.model = SentenceTransformer(model_name)
        self.model_name = model_name

    def evaluate_sts(self) -> float:
        """
        Evaluate on Semantic Textual Similarity benchmark.

        Returns:
            Spearman correlation score
        """
        # Sample STS test pairs (sentence1, sentence2, similarity_score)
        test_pairs = [
            ("A man is playing a guitar", "A person is playing music", 0.7),
            ("A dog is running in a field", "A cat is sleeping", 0.2),
            ("The weather is nice today", "It's a beautiful day", 0.9),
            ("Programming in Python", "Coding with Python language", 0.95),
            ("Machine learning model", "Deep neural network", 0.6),
        ]

        from scipy.stats import spearmanr

        predicted_scores = []
        actual_scores = []

        for sent1, sent2, actual in test_pairs:
            emb1 = self.model.encode(sent1)
            emb2 = self.model.encode(sent2)

            # Cosine similarity
            similarity = np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2))

            predicted_scores.append(similarity)
            actual_scores.append(actual)

        correlation, _ = spearmanr(predicted_scores, actual_scores)
        logger.info(f"STS Correlation: {correlation:.4f}")

        # Cast to a plain Python float so the value stays JSON-serializable
        return float(correlation)

    def evaluate_retrieval(self) -> float:
        """
        Evaluate retrieval accuracy.

        Returns:
            Accuracy score
        """
        # Query-document pairs with relevance
        queries_and_docs = [
            {
                "query": "How to learn Python programming?",
                "relevant": ["Python tutorial for beginners", "Learn Python step by step"],
                "irrelevant": ["Java programming guide", "Database design tutorial"]
            },
            {
                "query": "Best restaurants in Paris",
                "relevant": ["Top dining spots in Paris", "Where to eat in Paris"],
                "irrelevant": ["London travel guide", "New York attractions"]
            },
            {
                "query": "Machine learning basics",
                "relevant": ["Introduction to ML", "ML fundamentals explained"],
                "irrelevant": ["Cooking recipes", "Gardening tips"]
            }
        ]

        correct = 0
        total = 0

        for item in queries_and_docs:
            query = item["query"]
            all_docs = item["relevant"] + item["irrelevant"]

            query_emb = self.model.encode(query)
            doc_embs = self.model.encode(all_docs)

            # Calculate similarities
            similarities = [
                np.dot(query_emb, doc_emb) / (np.linalg.norm(query_emb) * np.linalg.norm(doc_emb))
                for doc_emb in doc_embs
            ]

            # Check if relevant docs rank higher
            num_relevant = len(item["relevant"])
            top_indices = np.argsort(similarities)[-num_relevant:]

            # Count correct retrievals
            correct += sum(1 for idx in top_indices if idx < num_relevant)
            total += num_relevant

        accuracy = correct / total
        logger.info(f"Retrieval Accuracy: {accuracy:.4f}")

        return accuracy

    def evaluate_speed(self, num_sentences: int = 1000) -> float:
        """
        Measure encoding speed.

        Args:
            num_sentences: Number of sentences to encode

        Returns:
            Sentences per second
        """
        import time

        # Generate test sentences
        test_sentences = [
            f"This is test sentence number {i} for speed evaluation."
            for i in range(num_sentences)
        ]

        # Warmup
        _ = self.model.encode(test_sentences[:10])

        # Measure
        start_time = time.time()
        _ = self.model.encode(test_sentences, batch_size=32)
        elapsed = time.time() - start_time

        speed = num_sentences / elapsed
        logger.info(f"Speed: {speed:.2f} sentences/sec")

        return speed

    def evaluate_clustering(self) -> float:
        """
        Evaluate clustering quality.

        Returns:
            Clustering score (silhouette score)
        """
        # Sample documents in categories
        documents = {
            "tech": [
                "Machine learning algorithms",
                "Python programming tutorial",
                "Data science basics"
            ],
            "food": [
                "Italian pasta recipes",
                "How to bake bread",
                "Cooking techniques"
            ],
            "travel": [
                "Best places to visit in Europe",
                "Travel tips for beginners",
                "Budget travel guide"
            ]
        }

        all_docs = []
        labels = []

        for category, docs in documents.items():
            all_docs.extend(docs)
            labels.extend([category] * len(docs))

        # Generate embeddings
        embeddings = self.model.encode(all_docs)

        # Calculate silhouette score
        from sklearn.metrics import silhouette_score
        from sklearn.preprocessing import LabelEncoder

        le = LabelEncoder()
        numeric_labels = le.fit_transform(labels)

        score = silhouette_score(embeddings, numeric_labels)
        logger.info(f"Clustering Score: {score:.4f}")

        # Cast to a plain Python float so the value stays JSON-serializable
        return float(score)

    def get_model_size(self) -> float:
        """
        Get model size in MB.

        Returns:
            Model size in megabytes
        """
        # Estimate from parameters
        num_params = sum(p.numel() for p in self.model.parameters())
        # Assuming float32 (4 bytes per parameter)
        size_mb = (num_params * 4) / (1024 * 1024)

        logger.info(f"Model Size: {size_mb:.2f} MB")

        return size_mb

    def run_full_evaluation(self, output_file: str = "embeddings_eval_results.json") -> EvaluationMetrics:
        """
        Run complete evaluation suite.

        Args:
            output_file: Output file for results

        Returns:
            EvaluationMetrics object
        """
        logger.info("=" * 60)
        logger.info("Starting Full Evaluation")
        logger.info("=" * 60)

        metrics = EvaluationMetrics()

        # Run evaluations
        try:
            metrics.sts_correlation = self.evaluate_sts()
        except Exception as e:
            logger.error(f"STS evaluation failed: {e}")

        try:
            metrics.retrieval_accuracy = self.evaluate_retrieval()
        except Exception as e:
            logger.error(f"Retrieval evaluation failed: {e}")

        try:
            metrics.clustering_score = self.evaluate_clustering()
        except Exception as e:
            logger.error(f"Clustering evaluation failed: {e}")

        try:
            metrics.speed_sentences_per_sec = self.evaluate_speed()
        except Exception as e:
            logger.error(f"Speed evaluation failed: {e}")

        try:
            metrics.model_size_mb = self.get_model_size()
        except Exception as e:
            logger.error(f"Size calculation failed: {e}")

        # Save results with an actual timestamp
        from datetime import datetime

        results = {
            "model": self.model_name,
            "metrics": metrics.to_dict(),
            "timestamp": datetime.now().isoformat()
        }

        with open(output_file, 'w') as f:
            json.dump(results, f, indent=2)

        logger.info("=" * 60)
        logger.info("Evaluation Complete")
        logger.info("=" * 60)
        logger.info(f"Results saved to: {output_file}")

        return metrics

def main():
    """Main evaluation function."""
    import argparse

    parser = argparse.ArgumentParser(
        description="Evaluate Helion-V1-Embeddings"
    )
    parser.add_argument(
        "--model",
        default="DeepXR/Helion-V1-embeddings",
        help="Model to evaluate"
    )
    parser.add_argument(
        "--output",
        default="embeddings_eval_results.json",
        help="Output file for results"
    )

    args = parser.parse_args()

    # Run evaluation
    evaluator = EmbeddingsEvaluator(args.model)
    metrics = evaluator.run_full_evaluation(args.output)

    # Print summary
    print("\n" + "=" * 60)
    print("EVALUATION RESULTS")
    print("=" * 60)
    print(f"STS Correlation: {metrics.sts_correlation:.4f}")
    print(f"Retrieval Accuracy: {metrics.retrieval_accuracy:.4f}")
    print(f"Clustering Score: {metrics.clustering_score:.4f}")
    print(f"Speed: {metrics.speed_sentences_per_sec:.0f} sent/sec")
    print(f"Model Size: {metrics.model_size_mb:.2f} MB")
    print("=" * 60)


if __name__ == "__main__":
    main()
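
Usage note: the script is meant to be run directly (python evaluate_embeddings.py --model DeepXR/Helion-V1-embeddings --output embeddings_eval_results.json), but the evaluator can also be driven programmatically. The sketch below is illustrative only, not part of the commit; it assumes the dependencies the script already imports (sentence-transformers, scipy, scikit-learn, numpy) are installed and that the file above sits on the import path.

    # Illustrative sketch, using only classes and methods defined in the committed file
    from evaluate_embeddings import EmbeddingsEvaluator

    evaluator = EmbeddingsEvaluator("DeepXR/Helion-V1-embeddings")

    # Run a single benchmark...
    sts = evaluator.evaluate_sts()
    print(f"STS correlation: {sts:.4f}")

    # ...or the full suite, which also writes a JSON report next to the script
    metrics = evaluator.run_full_evaluation("embeddings_eval_results.json")
    print(metrics.to_dict())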