diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..57e5ac8a9ac03ede4eaa68957b806405e85c8043 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +victord/sub19/models/bge-m3/tokenizer.json filter=lfs diff=lfs merge=lfs -text +victord/sub19/models/bge-reranker-v2-m3_pseudo_tune_full/tokenizer.json filter=lfs diff=lfs merge=lfs -text +victord/sub19/models/multilingual-e5-large_pseudo_full/tokenizer.json filter=lfs diff=lfs merge=lfs -text +victord/sub19/models/multilingual-e5-large-instruct/tokenizer.json filter=lfs diff=lfs merge=lfs -text +victord/sub19/models/snowflake-arctic-embed-l-v2.0/tokenizer.json filter=lfs diff=lfs merge=lfs -text +victord/sub19/models/Solon-embeddings-large-0.1/tokenizer.json filter=lfs diff=lfs merge=lfs -text +victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/victord/sub19/model.py b/victord/sub19/model.py new file mode 100644 index 0000000000000000000000000000000000000000..352d868386963779d0a7f16e9b997d2dec87e1b5 --- /dev/null +++ b/victord/sub19/model.py @@ -0,0 +1,1712 @@ + +import os +os.environ.setdefault("TOKENIZERS_PARALLELISM", "false") +os.environ.setdefault("HF_HUB_OFFLINE", "1") # never hit the Hub +os.environ.setdefault("TRANSFORMERS_OFFLINE", "1") # Transformers offline +os.environ.setdefault("HF_DATASETS_OFFLINE", "1") # Datasets offline +os.environ.setdefault("HF_HUB_DISABLE_TELEMETRY", "1") +os.environ.setdefault("DISABLE_TQDM", '1') +os.environ.setdefault("TQDM_DISABLE", '1') +os.environ["TQDM_DISABLE"] = "1" +os.environ["DISABLE_TQDM"] = "1" +import json +import torch +torch.backends.cuda.matmul.allow_tf32 = True +torch.backends.cudnn.allow_tf32 = True +import torch.nn.functional as F +import numpy as 
np +from transformers import AutoTokenizer, AutoModel +from sklearn.metrics.pairwise import cosine_similarity +import string +from sklearn.feature_extraction import _stop_words + +import time + +import bm25s + + +class BM25sRetriever: + def __init__(self, passages, ids): + self.corpus_tokens = bm25s.tokenize(passages, show_progress=False) + self.retriever = bm25s.BM25() + self.retriever.index(self.corpus_tokens, show_progress=False, leave_progress=False) + self.corpus_ids = ids + self.id2idx = {} + for i in range(len(self.corpus_ids)): + self.id2idx[self.corpus_ids[i]] = i + + def get_scores(self, query_text): + query_tokens = bm25s.tokenize(query_text) + return self.retriever.retrieve(query_tokens, sorted=False, k=len(self.corpus_ids), show_progress=False, leave_progress=False)[1][0] + + + + +class E5Retriever: + def __init__(self, model_name=None, device=None): + """ + Initializes the E5 retriever using the multilingual E5 base model. + """ + # Use local model + if model_name is None: + # local_model_path = os.path.join('sub/models', 'multilingual-e5-large') # 'multilingual-e5-large' + local_model_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'models', 'multilingual-e5-large_pseudo_full') # 'multilingual-e5-large' + if os.path.isdir(local_model_path): + model_name = local_model_path + print(f"Using local E5 model from: {model_name}") + + self.device = device or ("cuda" if torch.cuda.is_available() else "cpu") + + # Clear GPU cache before loading model + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + print(f"Loading E5 multilingual model on device: {self.device}") + self.tokenizer = AutoTokenizer.from_pretrained(model_name) + self.model = AutoModel.from_pretrained(model_name, torch_dtype=torch.bfloat16).to(self.device) + self.model.eval() + + # Clear cache after model loading + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + self.corpus_ids = [] + self.corpus_embeddings = None + + def embed_texts(self, texts, 
is_query=False, batch_size=32): + """ + Generates embeddings for texts using E5 model with proper prefixes. + E5 requires specific prefixes for queries vs passages. + """ + # E5 model requires specific prefixes + if is_query: + # Add query prefix for E5 + prefixed_texts = [f"query: {text.strip()}" for text in texts] + else: + # Add passage prefix for E5 + prefixed_texts = [f"passage: {text.strip()}" for text in texts] + + all_embeddings = [] + total_batches = (len(prefixed_texts) + batch_size - 1) // batch_size + + for i in range(0, len(prefixed_texts), batch_size): + batch_num = i // batch_size + 1 + if not is_query and batch_num % 50 == 0: + print(f"Processing batch {batch_num}/{total_batches} ({(batch_num/total_batches)*100:.1f}%)") + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + batch_texts = prefixed_texts[i:i + batch_size] + + try: + encoded = self.tokenizer( + batch_texts, + padding=True, + truncation=True, + max_length=512, + return_tensors='pt' + ).to(self.device) + + with torch.no_grad(): + model_output = self.model(**encoded) + + # E5 uses mean pooling with attention mask + attention_mask = encoded['attention_mask'] + embeddings = model_output.last_hidden_state + + # Mean pooling + mask_expanded = attention_mask.unsqueeze(-1).expand(embeddings.size()).float() + sum_embeddings = torch.sum(embeddings * mask_expanded, 1) + sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9) + embeddings = sum_embeddings / sum_mask + + # L2 normalize embeddings (important for E5) + embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1) + + # Move to CPU immediately + all_embeddings.append(embeddings.cpu()) + + # Clear GPU memory + del encoded, model_output, embeddings + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + except torch.cuda.OutOfMemoryError as e: + print(f"CUDA OOM at batch {batch_num}, reducing batch size...") + # Process one item at a time + for single_text in batch_texts: + try: + encoded = self.tokenizer( + 
[single_text], + padding=True, + truncation=True, + max_length=512, + return_tensors='pt' + ).to(self.device) + + with torch.no_grad(): + model_output = self.model(**encoded) + attention_mask = encoded['attention_mask'] + embeddings = model_output.last_hidden_state + + # Mean pooling + mask_expanded = attention_mask.unsqueeze(-1).expand(embeddings.size()).float() + sum_embeddings = torch.sum(embeddings * mask_expanded, 1) + sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9) + embeddings = sum_embeddings / sum_mask + embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1) + + all_embeddings.append(embeddings.cpu()) + + del encoded, model_output, embeddings + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + except Exception as e2: + print(f"Failed to process single text: {e2}") + # E5-base has 768 dimensions. Large 1024 + zero_embedding = torch.zeros(1, 1024).float() + all_embeddings.append(zero_embedding) + + return torch.cat(all_embeddings, dim=0).numpy() + + def prepare_corpus(self, texts, ids, batch_size=32): + + # Add passage prefix for E5 + prefixed_texts = [f"passage: {text.strip()}" for text in texts] + + all_embeddings = [] + + all_tokens = [] + all_ids = [] + all_len = [] + + prefix_len = 3 + suffix_len = 1 + max_length = 512 + max_passage = max_length - prefix_len - suffix_len + overlap = max_passage // 2 + + for i in range(len(prefixed_texts)): + encoded = self.tokenizer( + [prefixed_texts[i]], + padding=False, + truncation=False, + # max_length=512, + # return_tensors='pt' + ) + + if len(encoded['input_ids'][0]) > max_length: + _idxs = list(range(prefix_len, len(encoded['input_ids'][0])-suffix_len)) + + i0 = 0 + while i0 < len(_idxs) - overlap: + i1 = min(i0+max_passage, len(_idxs)) + all_ids.append((ids[i], (i0, i1, len(_idxs)))) + all_tokens.append({ + 'input_ids': encoded['input_ids'][0][:prefix_len] + encoded['input_ids'][0][i0+prefix_len:i1+prefix_len] + encoded['input_ids'][0][-suffix_len:], + 'attention_mask': 
encoded['attention_mask'][0][:prefix_len] + encoded['attention_mask'][0][i0+prefix_len:i1+prefix_len] + encoded['attention_mask'][0][-suffix_len:] + }) + all_len.append(i1-i0) + i0 += max_passage - overlap + else: + all_ids.append((ids[i], None)) + all_tokens.append({'input_ids': encoded['input_ids'][0], 'attention_mask': encoded['attention_mask'][0]}) + all_len.append(len(encoded['input_ids'][0]) - prefix_len - suffix_len) + + + total_batches = (len(all_tokens) + batch_size - 1) // batch_size + + all_len, all_tokens, all_ids = zip(*sorted(zip(all_len, all_tokens, all_ids), key=lambda x: x[0])) + + + for i in range(0, len(all_tokens), batch_size): + batch_num = i // batch_size + 1 + if batch_num % 50 == 0: + print(f"Processing batch {batch_num}/{total_batches} ({(batch_num/total_batches)*100:.1f}%)") + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + batch_tokens = all_tokens[i:i + batch_size] + + batch_max = max([len(ids['input_ids']) for ids in batch_tokens]) + encoded = dict() + encoded["attention_mask"] = [s['attention_mask'] + (batch_max - len(s['attention_mask'])) * [0] for s in batch_tokens] + encoded["input_ids"] = [s['input_ids'] + (batch_max - len(s['attention_mask'])) * [0] for s in batch_tokens] + encoded["attention_mask"] = torch.tensor(encoded["attention_mask"], dtype=torch.long).to(self.device) + encoded["input_ids"] = torch.tensor(encoded["input_ids"], dtype=torch.long).to(self.device) + + + + try: + + with torch.no_grad(): + model_output = self.model(**encoded) + + # E5 uses mean pooling with attention mask + attention_mask = encoded['attention_mask'] + embeddings = model_output.last_hidden_state + + # Mean pooling + mask_expanded = attention_mask.unsqueeze(-1).expand(embeddings.size()).float() + sum_embeddings = torch.sum(embeddings * mask_expanded, 1) + sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9) + embeddings = sum_embeddings / sum_mask + + # L2 normalize embeddings (important for E5) + embeddings = 
torch.nn.functional.normalize(embeddings, p=2, dim=1) + + # Move to CPU immediately + all_embeddings.append(embeddings.cpu()) + + # Clear GPU memory + del encoded, model_output, embeddings + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + except torch.cuda.OutOfMemoryError as e: + print(f"CUDA OOM at batch {batch_num}, reducing batch size...") + # Process one item at a time + for j in range(len(encoded)): + try: + + encoded0 = encoded[j:j+1] + + with torch.no_grad(): + model_output = self.model(**encoded0) + attention_mask = encoded0['attention_mask'] + embeddings = model_output.last_hidden_state + + # Mean pooling + mask_expanded = attention_mask.unsqueeze(-1).expand(embeddings.size()).float() + sum_embeddings = torch.sum(embeddings * mask_expanded, 1) + sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9) + embeddings = sum_embeddings / sum_mask + embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1) + + all_embeddings.append(embeddings.cpu()) + + del encoded0, model_output, embeddings + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + except Exception as e2: + print(f"Failed to process single text: {e2}") + # E5-base has 768 dimensions. Large 1024 + zero_embedding = torch.zeros(1, 1024).float() + all_embeddings.append(zero_embedding) + + id2idx = {} + for i in range(len(all_ids)): + if all_ids[i][0] not in id2idx: + id2idx[all_ids[i][0]] = [] + id2idx[all_ids[i][0]].append(i) + + return torch.cat(all_embeddings, dim=0).to(self.model.device), all_ids, id2idx # numpy() + + + +class E5InstructRetriever: + def __init__(self, model_name=None, device=None): + """ + Initializes the E5 Instruct retriever using the multilingual E5 Instruct large model. 
+ """ + # Use local model + if model_name is None: + # local_model_path = os.path.join('sub/models', 'multilingual-e5-large-instruct') + local_model_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'models', 'multilingual-e5-large-instruct') + if os.path.isdir(local_model_path): + model_name = local_model_path + print(f"Using local E5 Instruct model from: {model_name}") + + self.device = device or ("cuda" if torch.cuda.is_available() else "cpu") + + # Clear GPU cache before loading model + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + print(f"Loading E5 Instruct multilingual model on device: {self.device}") + self.tokenizer = AutoTokenizer.from_pretrained(model_name) + self.model = AutoModel.from_pretrained(model_name, torch_dtype=torch.bfloat16).to(self.device) # torch.float16 + self.model.eval() + + # Clear cache after model loading + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + self.corpus_ids = [] + self.corpus_embeddings = None + + def embed_texts(self, texts, is_query=False, batch_size=32): + """ + Generates embeddings for texts using E5 Instruct model with proper prefixes. + E5 Instruct requires specific prefixes for queries vs passages. 
+ """ + task = 'Given a web search query, retrieve relevant passages that answer the query' + # E5 model requires specific prefixes + if is_query: + # Add query prefix for E5 + prefixed_texts = [f"Instruct: {task}\nQuery: {text.strip()}" for text in texts] + else: + # Add passage prefix for E5 + prefixed_texts = [f"{text.strip()}" for text in texts] + + all_embeddings = [] + total_batches = (len(prefixed_texts) + batch_size - 1) // batch_size + + for i in range(0, len(prefixed_texts), batch_size): + batch_num = i // batch_size + 1 + if not is_query and batch_num % 50 == 0: + print(f"Processing batch {batch_num}/{total_batches} ({(batch_num/total_batches)*100:.1f}%)") + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + batch_texts = prefixed_texts[i:i + batch_size] + + try: + encoded = self.tokenizer( + batch_texts, + padding=True, + truncation=True, + max_length=512, + return_tensors='pt' + ).to(self.device) + + with torch.no_grad(): + model_output = self.model(**encoded) + + # E5 uses mean pooling with attention mask + attention_mask = encoded['attention_mask'] + embeddings = model_output.last_hidden_state + + # Mean pooling + mask_expanded = attention_mask.unsqueeze(-1).expand(embeddings.size()).float() + sum_embeddings = torch.sum(embeddings * mask_expanded, 1) + sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9) + embeddings = sum_embeddings / sum_mask + + # L2 normalize embeddings (important for E5) + embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1) + + # Move to CPU immediately + all_embeddings.append(embeddings.cpu()) + + # Clear GPU memory + del encoded, model_output, embeddings + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + except torch.cuda.OutOfMemoryError as e: + print(f"CUDA OOM at batch {batch_num}, reducing batch size...") + # Process one item at a time + for single_text in batch_texts: + try: + encoded = self.tokenizer( + [single_text], + padding=True, + truncation=True, + max_length=512, + 
return_tensors='pt' + ).to(self.device) + + with torch.no_grad(): + model_output = self.model(**encoded) + attention_mask = encoded['attention_mask'] + embeddings = model_output.last_hidden_state + + # Mean pooling + mask_expanded = attention_mask.unsqueeze(-1).expand(embeddings.size()).float() + sum_embeddings = torch.sum(embeddings * mask_expanded, 1) + sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9) + embeddings = sum_embeddings / sum_mask + embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1) + + all_embeddings.append(embeddings.cpu()) + + del encoded, model_output, embeddings + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + except Exception as e2: + print(f"Failed to process single text: {e2}") + # E5-base has 768 dimensions. Large 1024 + zero_embedding = torch.zeros(1, 1024).float() + all_embeddings.append(zero_embedding) + + return torch.cat(all_embeddings, dim=0).numpy() + + def prepare_corpus(self, texts, ids, batch_size=32): + + # Add passage prefix for E5 + prefixed_texts = [f"{text.strip()}" for text in texts] + + all_embeddings = [] + + all_tokens = [] + all_ids = [] + all_len = [] + + prefix_len = 1 + suffix_len = 1 + max_length = 512 + max_passage = max_length - prefix_len - suffix_len + overlap = max_passage // 2 + + for i in range(len(prefixed_texts)): # tqdm( + encoded = self.tokenizer( + [prefixed_texts[i]], + padding=False, + truncation=False, + # max_length=512, + # return_tensors='pt' + ) + + if len(encoded['input_ids'][0]) > max_length: + _idxs = list(range(prefix_len, len(encoded['input_ids'][0])-suffix_len)) + + i0 = 0 + while i0 < len(_idxs) - overlap: + i1 = min(i0+max_passage, len(_idxs)) + all_ids.append((ids[i], (i0, i1, len(_idxs)))) + all_tokens.append({ + 'input_ids': encoded['input_ids'][0][:prefix_len] + encoded['input_ids'][0][i0+prefix_len:i1+prefix_len] + encoded['input_ids'][0][-suffix_len:], + 'attention_mask': encoded['attention_mask'][0][:prefix_len] + 
encoded['attention_mask'][0][i0+prefix_len:i1+prefix_len] + encoded['attention_mask'][0][-suffix_len:] + }) + all_len.append(i1-i0) + i0 += max_passage - overlap + else: + all_ids.append((ids[i], None)) + all_tokens.append({'input_ids': encoded['input_ids'][0], 'attention_mask': encoded['attention_mask'][0]}) + all_len.append(len(encoded['input_ids'][0]) - prefix_len - suffix_len) + + + total_batches = (len(all_tokens) + batch_size - 1) // batch_size + + all_len, all_tokens, all_ids = zip(*sorted(zip(all_len, all_tokens, all_ids), key=lambda x: x[0])) + + + for i in range(0, len(all_tokens), batch_size): + batch_num = i // batch_size + 1 + if batch_num % 50 == 0: + print(f"Processing batch {batch_num}/{total_batches} ({(batch_num/total_batches)*100:.1f}%)") + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + batch_tokens = all_tokens[i:i + batch_size] + + batch_max = max([len(ids['input_ids']) for ids in batch_tokens]) + encoded = dict() + encoded["attention_mask"] = [s['attention_mask'] + (batch_max - len(s['attention_mask'])) * [0] for s in batch_tokens] + encoded["input_ids"] = [s['input_ids'] + (batch_max - len(s['attention_mask'])) * [0] for s in batch_tokens] + encoded["attention_mask"] = torch.tensor(encoded["attention_mask"], dtype=torch.long).to(self.device) + encoded["input_ids"] = torch.tensor(encoded["input_ids"], dtype=torch.long).to(self.device) + + + + try: + + with torch.no_grad(): + model_output = self.model(**encoded) + + # E5 uses mean pooling with attention mask + attention_mask = encoded['attention_mask'] + embeddings = model_output.last_hidden_state + + # Mean pooling + mask_expanded = attention_mask.unsqueeze(-1).expand(embeddings.size()).float() + sum_embeddings = torch.sum(embeddings * mask_expanded, 1) + sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9) + embeddings = sum_embeddings / sum_mask + + # L2 normalize embeddings (important for E5) + embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1) + + # Move to 
CPU immediately + all_embeddings.append(embeddings.cpu()) + + # Clear GPU memory + del encoded, model_output, embeddings + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + except torch.cuda.OutOfMemoryError as e: + print(f"CUDA OOM at batch {batch_num}, reducing batch size...") + # Process one item at a time + for j in range(len(encoded)): + try: + + encoded0 = encoded[j:j+1] + + with torch.no_grad(): + model_output = self.model(**encoded0) + attention_mask = encoded0['attention_mask'] + embeddings = model_output.last_hidden_state + + # Mean pooling + mask_expanded = attention_mask.unsqueeze(-1).expand(embeddings.size()).float() + sum_embeddings = torch.sum(embeddings * mask_expanded, 1) + sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9) + embeddings = sum_embeddings / sum_mask + embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1) + + all_embeddings.append(embeddings.cpu()) + + del encoded0, model_output, embeddings + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + except Exception as e2: + print(f"Failed to process single text: {e2}") + # E5-base has 768 dimensions. Large 1024 + zero_embedding = torch.zeros(1, 1024).float() + all_embeddings.append(zero_embedding) + + id2idx = {} + for i in range(len(all_ids)): + if all_ids[i][0] not in id2idx: + id2idx[all_ids[i][0]] = [] + id2idx[all_ids[i][0]].append(i) + + return torch.cat(all_embeddings, dim=0).to(self.model.device), all_ids, id2idx # numpy() + + + + +class SnowflakeInstructRetriever: + def __init__(self, model_name=None, device=None): + """ + Initializes the retriever using the multilingual snowflake-arctic-embed-l-v2.0 model. 
+ """ + # Use local model + if model_name is None: + # local_model_path = os.path.join('sub/models', 'snowflake-arctic-embed-l-v2.0') + local_model_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'models', 'snowflake-arctic-embed-l-v2.0') + if os.path.isdir(local_model_path): + model_name = local_model_path + print(f"Using local snowflake-arctic-embed-l-v2.0 model from: {model_name}") + + self.device = device or ("cuda" if torch.cuda.is_available() else "cpu") + + # Clear GPU cache before loading model + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + print(f"Loading snowflake-arctic-embed-l-v2.0 multilingual model on device: {self.device}") + self.tokenizer = AutoTokenizer.from_pretrained(model_name) + self.model = AutoModel.from_pretrained(model_name, torch_dtype=torch.bfloat16).to(self.device) + self.model.eval() + + # Clear cache after model loading + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + self.corpus_ids = [] + self.corpus_embeddings = None + + def embed_texts(self, texts, is_query=False, batch_size=32): + """ + Generates embeddings for texts using snowflake-arctic-embed-l-v2.0 model with proper prefixes. + snowflake-arctic-embed-l-v2.0 requires specific prefixes for queries vs passages. 
+ """ + + # E5 model requires specific prefixes + if is_query: + # Add query prefix for E5 + prefixed_texts = [f"query: {text.strip()}" for text in texts] + else: + # Add passage prefix for E5 + prefixed_texts = [f"{text.strip()}" for text in texts] + + all_embeddings = [] + total_batches = (len(prefixed_texts) + batch_size - 1) // batch_size + + for i in range(0, len(prefixed_texts), batch_size): + batch_num = i // batch_size + 1 + if not is_query and batch_num % 50 == 0: + print(f"Processing batch {batch_num}/{total_batches} ({(batch_num/total_batches)*100:.1f}%)") + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + batch_texts = prefixed_texts[i:i + batch_size] + + try: + encoded = self.tokenizer( + batch_texts, + padding=True, + truncation=True, + max_length=1024, # 512 + return_tensors='pt' + ).to(self.device) + + with torch.no_grad(): + model_output = self.model(**encoded) + + embeddings = model_output[0][:, 0].float() + # L2 normalize embeddings + embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1) + + # Move to CPU immediately + all_embeddings.append(embeddings.cpu()) + + # Clear GPU memory + del encoded, model_output, embeddings + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + except torch.cuda.OutOfMemoryError as e: + print(f"CUDA OOM at batch {batch_num}, reducing batch size...") + # Process one item at a time + for single_text in batch_texts: + try: + encoded = self.tokenizer( + [single_text], + padding=True, + truncation=True, + max_length=512, + return_tensors='pt' + ).to(self.device) + + with torch.no_grad(): + model_output = self.model(**encoded) + + embeddings = model_output[0][:, 0].float() + embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1) + + all_embeddings.append(embeddings.cpu()) + + del encoded, model_output, embeddings + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + except Exception as e2: + print(f"Failed to process single text: {e2}") + + zero_embedding = 
torch.zeros(1, 1024).float() + all_embeddings.append(zero_embedding) + + return torch.cat(all_embeddings, dim=0).numpy() + + def prepare_corpus(self, texts, ids, batch_size=32): + prefixed_texts = [f"{text.strip()}" for text in texts] + + all_embeddings = [] + + all_tokens = [] + all_ids = [] + all_len = [] + + prefix_len = 1 + suffix_len = 1 + max_length = 1024 # 512 + max_passage = max_length - prefix_len - suffix_len + overlap = max_passage // 2 + + for i in range(len(prefixed_texts)): + encoded = self.tokenizer( + [prefixed_texts[i]], + padding=False, + truncation=False, + # max_length=512, + # return_tensors='pt' + ) + + if len(encoded['input_ids'][0]) > max_length: + _idxs = list(range(prefix_len, len(encoded['input_ids'][0])-suffix_len)) + + i0 = 0 + while i0 < len(_idxs) - overlap: + i1 = min(i0+max_passage, len(_idxs)) + all_ids.append((ids[i], (i0, i1, len(_idxs)))) + all_tokens.append({ + 'input_ids': encoded['input_ids'][0][:prefix_len] + encoded['input_ids'][0][i0+prefix_len:i1+prefix_len] + encoded['input_ids'][0][-suffix_len:], + 'attention_mask': encoded['attention_mask'][0][:prefix_len] + encoded['attention_mask'][0][i0+prefix_len:i1+prefix_len] + encoded['attention_mask'][0][-suffix_len:] + }) + all_len.append(i1-i0) + i0 += max_passage - overlap + else: + all_ids.append((ids[i], None)) + all_tokens.append({'input_ids': encoded['input_ids'][0], 'attention_mask': encoded['attention_mask'][0]}) + all_len.append(len(encoded['input_ids'][0]) - prefix_len - suffix_len) + + + total_batches = (len(all_tokens) + batch_size - 1) // batch_size + + all_len, all_tokens, all_ids = zip(*sorted(zip(all_len, all_tokens, all_ids), key=lambda x: x[0])) + + + for i in range(0, len(all_tokens), batch_size): + batch_num = i // batch_size + 1 + if batch_num % 50 == 0: + print(f"Processing batch {batch_num}/{total_batches} ({(batch_num/total_batches)*100:.1f}%)") + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + batch_tokens = all_tokens[i:i + batch_size] + 
+ batch_max = max([len(ids['input_ids']) for ids in batch_tokens]) + encoded = dict() + encoded["attention_mask"] = [s['attention_mask'] + (batch_max - len(s['attention_mask'])) * [0] for s in batch_tokens] + encoded["input_ids"] = [s['input_ids'] + (batch_max - len(s['attention_mask'])) * [0] for s in batch_tokens] + encoded["attention_mask"] = torch.tensor(encoded["attention_mask"], dtype=torch.long).to(self.device) + encoded["input_ids"] = torch.tensor(encoded["input_ids"], dtype=torch.long).to(self.device) + + + + try: + + with torch.no_grad(): + model_output = self.model(**encoded) + + embeddings = model_output[0][:, 0].float() + # L2 normalize embeddings + embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1) + + # Move to CPU immediately + all_embeddings.append(embeddings.cpu()) + + # Clear GPU memory + del encoded, model_output, embeddings + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + except torch.cuda.OutOfMemoryError as e: + print(f"CUDA OOM at batch {batch_num}, reducing batch size...") + # Process one item at a time + for j in range(len(encoded)): + try: + + encoded0 = encoded[j:j+1] + + with torch.no_grad(): + model_output = self.model(**encoded0) + + embeddings = model_output[0][:, 0].float() + embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1) + + all_embeddings.append(embeddings.cpu()) + + del encoded0, model_output, embeddings + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + except Exception as e2: + print(f"Failed to process single text: {e2}") + zero_embedding = torch.zeros(1, 1024).float() + all_embeddings.append(zero_embedding) + + id2idx = {} + for i in range(len(all_ids)): + if all_ids[i][0] not in id2idx: + id2idx[all_ids[i][0]] = [] + id2idx[all_ids[i][0]].append(i) + + return torch.cat(all_embeddings, dim=0).to(self.model.device), all_ids, id2idx # numpy() + + + +class SolonRetriever: + def __init__(self, model_name=None, device=None): + """ + Initializes the retriever using the 
multilingual Solon-embeddings-large-0.1 model. + """ + # Use local model + if model_name is None: + # local_model_path = os.path.join('sub/models', 'Solon-embeddings-large-0.1') + local_model_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'models', 'Solon-embeddings-large-0.1') + if os.path.isdir(local_model_path): + model_name = local_model_path + print(f"Using local Solon-embeddings-large-0.1 model from: {model_name}") + + self.device = device or ("cuda" if torch.cuda.is_available() else "cpu") + + # Clear GPU cache before loading model + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + print(f"Loading Solon-embeddings-large-0.1 multilingual model on device: {self.device}") + self.tokenizer = AutoTokenizer.from_pretrained(model_name) + self.model = AutoModel.from_pretrained(model_name, torch_dtype=torch.bfloat16).to(self.device) # torch.float16 + self.model.eval() + + # Clear cache after model loading + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + self.corpus_ids = [] + self.corpus_embeddings = None + + def embed_texts(self, texts, is_query=False, batch_size=32): + """ + Generates embeddings for texts using Solon-embeddings-large-0.1 model with proper prefixes. + Solon-embeddings-large-0.1 requires specific prefixes for queries vs passages. 
+ """ + + # E5 model requires specific prefixes + if is_query: + # Add query prefix + prefixed_texts = [f"query : {text.strip()}" for text in texts] + else: + prefixed_texts = [f"{text.strip()}" for text in texts] + + all_embeddings = [] + total_batches = (len(prefixed_texts) + batch_size - 1) // batch_size + + for i in range(0, len(prefixed_texts), batch_size): + batch_num = i // batch_size + 1 + if not is_query and batch_num % 50 == 0: + print(f"Processing batch {batch_num}/{total_batches} ({(batch_num/total_batches)*100:.1f}%)") + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + batch_texts = prefixed_texts[i:i + batch_size] + + try: + encoded = self.tokenizer( + batch_texts, + padding=True, + truncation=True, + max_length=512, # 512 + return_tensors='pt' + ).to(self.device) + + with torch.no_grad(): + model_output = self.model(**encoded) + + # E5 uses mean pooling with attention mask + attention_mask = encoded['attention_mask'] + embeddings = model_output.last_hidden_state + + # Mean pooling + mask_expanded = attention_mask.unsqueeze(-1).expand(embeddings.size()).float() + sum_embeddings = torch.sum(embeddings * mask_expanded, 1) + sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9) + embeddings = sum_embeddings / sum_mask + + # embeddings = model_output[0][:, 0].float() + # L2 normalize embeddings (important for E5) + embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1) + + # Move to CPU immediately + all_embeddings.append(embeddings.cpu()) + + # Clear GPU memory + del encoded, model_output, embeddings + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + except torch.cuda.OutOfMemoryError as e: + print(f"CUDA OOM at batch {batch_num}, reducing batch size...") + # Process one item at a time + for single_text in batch_texts: + try: + encoded = self.tokenizer( + [single_text], + padding=True, + truncation=True, + max_length=512, + return_tensors='pt' + ).to(self.device) + + with torch.no_grad(): + model_output = 
self.model(**encoded) + attention_mask = encoded['attention_mask'] + embeddings = model_output.last_hidden_state + + # Mean pooling + mask_expanded = attention_mask.unsqueeze(-1).expand(embeddings.size()).float() + sum_embeddings = torch.sum(embeddings * mask_expanded, 1) + sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9) + embeddings = sum_embeddings / sum_mask + + # embeddings = model_output[0][:, 0].float() + embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1) + + all_embeddings.append(embeddings.cpu()) + + del encoded, model_output, embeddings + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + except Exception as e2: + print(f"Failed to process single text: {e2}") + + zero_embedding = torch.zeros(1, 1024).float() + all_embeddings.append(zero_embedding) + + return torch.cat(all_embeddings, dim=0).numpy() + + def prepare_corpus(self, texts, ids, batch_size=32): + prefixed_texts = [f"{text.strip()}" for text in texts] + + all_embeddings = [] + + all_tokens = [] + all_ids = [] + all_len = [] + + prefix_len = 1 + suffix_len = 1 + max_length = 512 # 1024 # 512 + max_passage = max_length - prefix_len - suffix_len + overlap = max_passage // 2 + + for i in range(len(prefixed_texts)): # tqdm( + encoded = self.tokenizer( + [prefixed_texts[i]], + padding=False, + truncation=False, + # max_length=512, + # return_tensors='pt' + ) + + if len(encoded['input_ids'][0]) > max_length: + _idxs = list(range(prefix_len, len(encoded['input_ids'][0])-suffix_len)) + + i0 = 0 + while i0 < len(_idxs) - overlap: + i1 = min(i0+max_passage, len(_idxs)) + all_ids.append((ids[i], (i0, i1, len(_idxs)))) + all_tokens.append({ + 'input_ids': encoded['input_ids'][0][:prefix_len] + encoded['input_ids'][0][i0+prefix_len:i1+prefix_len] + encoded['input_ids'][0][-suffix_len:], + 'attention_mask': encoded['attention_mask'][0][:prefix_len] + encoded['attention_mask'][0][i0+prefix_len:i1+prefix_len] + encoded['attention_mask'][0][-suffix_len:] + }) + all_len.append(i1-i0) 
+ i0 += max_passage - overlap + else: + all_ids.append((ids[i], None)) + all_tokens.append({'input_ids': encoded['input_ids'][0], 'attention_mask': encoded['attention_mask'][0]}) + all_len.append(len(encoded['input_ids'][0]) - prefix_len - suffix_len) + + + total_batches = (len(all_tokens) + batch_size - 1) // batch_size + + all_len, all_tokens, all_ids = zip(*sorted(zip(all_len, all_tokens, all_ids), key=lambda x: x[0])) + + + for i in range(0, len(all_tokens), batch_size): + batch_num = i // batch_size + 1 + if batch_num % 50 == 0: + print(f"Processing batch {batch_num}/{total_batches} ({(batch_num/total_batches)*100:.1f}%)") + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + batch_tokens = all_tokens[i:i + batch_size] + + batch_max = max([len(ids['input_ids']) for ids in batch_tokens]) + encoded = dict() + encoded["attention_mask"] = [s['attention_mask'] + (batch_max - len(s['attention_mask'])) * [0] for s in batch_tokens] + encoded["input_ids"] = [s['input_ids'] + (batch_max - len(s['attention_mask'])) * [0] for s in batch_tokens] + encoded["attention_mask"] = torch.tensor(encoded["attention_mask"], dtype=torch.long).to(self.device) + encoded["input_ids"] = torch.tensor(encoded["input_ids"], dtype=torch.long).to(self.device) + + + + try: + + with torch.no_grad(): + model_output = self.model(**encoded) + + # E5 uses mean pooling with attention mask + attention_mask = encoded['attention_mask'] + embeddings = model_output.last_hidden_state + + # Mean pooling + mask_expanded = attention_mask.unsqueeze(-1).expand(embeddings.size()).float() + sum_embeddings = torch.sum(embeddings * mask_expanded, 1) + sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9) + embeddings = sum_embeddings / sum_mask + + # L2 normalize embeddings + embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1) + + # Move to CPU immediately + all_embeddings.append(embeddings.cpu()) + + # Clear GPU memory + del encoded, model_output, embeddings + if 
torch.cuda.is_available(): + torch.cuda.empty_cache() + + except torch.cuda.OutOfMemoryError as e: + print(f"CUDA OOM at batch {batch_num}, reducing batch size...") + # Process one item at a time + for j in range(len(encoded)): + try: + + encoded0 = encoded[j:j+1] + + with torch.no_grad(): + model_output = self.model(**encoded0) + attention_mask = encoded0['attention_mask'] + embeddings = model_output.last_hidden_state + + # Mean pooling + mask_expanded = attention_mask.unsqueeze(-1).expand(embeddings.size()).float() + sum_embeddings = torch.sum(embeddings * mask_expanded, 1) + sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9) + embeddings = sum_embeddings / sum_mask + + embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1) + + all_embeddings.append(embeddings.cpu()) + + del encoded0, model_output, embeddings + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + except Exception as e2: + print(f"Failed to process single text: {e2}") + zero_embedding = torch.zeros(1, 1024).float() + all_embeddings.append(zero_embedding) + + id2idx = {} + for i in range(len(all_ids)): + if all_ids[i][0] not in id2idx: + id2idx[all_ids[i][0]] = [] + id2idx[all_ids[i][0]].append(i) + + return torch.cat(all_embeddings, dim=0).to(self.model.device), all_ids, id2idx # numpy() + + +class M3Retriever: + def __init__(self, model_name=None, device=None): + """ + Initializes the bge-m3 retriever using the multilingual bge-m3 model. 
+ """ + # Use local model + if model_name is None: + # local_model_path = os.path.join('sub/models', 'bge-m3') + local_model_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'models', 'bge-m3') + if os.path.isdir(local_model_path): + model_name = local_model_path + print(f"Using local bge-m3 model from: {model_name}") + + self.device = device or ("cuda" if torch.cuda.is_available() else "cpu") + + # Clear GPU cache before loading model + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + print(f"Loading bge-m3 multilingual model on device: {self.device}") + self.tokenizer = AutoTokenizer.from_pretrained(model_name) + self.model = AutoModel.from_pretrained(model_name, torch_dtype=torch.bfloat16).to(self.device) # torch.float16 + self.model.eval() + + # Clear cache after model loading + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + self.corpus_ids = [] + self.corpus_embeddings = None + + def embed_texts(self, texts, is_query=False, batch_size=32): + """ + Generates embeddings for texts using E5 Instruct model with proper prefixes. + bge-m3 requires specific prefixes for queries vs passages. 
+ """ + + prefixed_texts = [f"{text.strip()}" for text in texts] + + all_embeddings = [] + total_batches = (len(prefixed_texts) + batch_size - 1) // batch_size + + for i in range(0, len(prefixed_texts), batch_size): + batch_num = i // batch_size + 1 + if not is_query and batch_num % 50 == 0: + print(f"Processing batch {batch_num}/{total_batches} ({(batch_num/total_batches)*100:.1f}%)") + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + batch_texts = prefixed_texts[i:i + batch_size] + + try: + encoded = self.tokenizer( + batch_texts, + padding=True, + truncation=True, + max_length=512, + return_tensors='pt' + ).to(self.device) + + with torch.no_grad(): + model_output = self.model(**encoded) + + embeddings = model_output[0][:, 0].float() + + # L2 normalize embeddings + embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1) + + # Move to CPU immediately + all_embeddings.append(embeddings.cpu()) + + # Clear GPU memory + del encoded, model_output, embeddings + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + except torch.cuda.OutOfMemoryError as e: + print(f"CUDA OOM at batch {batch_num}, reducing batch size...") + # Process one item at a time + for single_text in batch_texts: + try: + encoded = self.tokenizer( + [single_text], + padding=True, + truncation=True, + max_length=512, + return_tensors='pt' + ).to(self.device) + + with torch.no_grad(): + model_output = self.model(**encoded) + + embeddings = model_output[0][:, 0].float() + embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1) + + all_embeddings.append(embeddings.cpu()) + + del encoded, model_output, embeddings + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + except Exception as e2: + print(f"Failed to process single text: {e2}") + zero_embedding = torch.zeros(1, 1024).float() + all_embeddings.append(zero_embedding) + + return torch.cat(all_embeddings, dim=0).numpy() + + def prepare_corpus(self, texts, ids, batch_size=32): + prefixed_texts = 
[f"{text.strip()}" for text in texts] + + all_embeddings = [] + + all_tokens = [] + all_ids = [] + all_len = [] + + prefix_len = 1 + suffix_len = 1 + max_length = 512 + max_passage = max_length - prefix_len - suffix_len + overlap = max_passage // 2 + + for i in range(len(prefixed_texts)): # tqdm( + encoded = self.tokenizer( + [prefixed_texts[i]], + padding=False, + truncation=False, + # max_length=512, + # return_tensors='pt' + ) + + if len(encoded['input_ids'][0]) > max_length: + _idxs = list(range(prefix_len, len(encoded['input_ids'][0])-suffix_len)) + + i0 = 0 + while i0 < len(_idxs) - overlap: + i1 = min(i0+max_passage, len(_idxs)) + all_ids.append((ids[i], (i0, i1, len(_idxs)))) + all_tokens.append({ + 'input_ids': encoded['input_ids'][0][:prefix_len] + encoded['input_ids'][0][i0+prefix_len:i1+prefix_len] + encoded['input_ids'][0][-suffix_len:], + 'attention_mask': encoded['attention_mask'][0][:prefix_len] + encoded['attention_mask'][0][i0+prefix_len:i1+prefix_len] + encoded['attention_mask'][0][-suffix_len:] + }) + all_len.append(i1-i0) + i0 += max_passage - overlap + else: + all_ids.append((ids[i], None)) + all_tokens.append({'input_ids': encoded['input_ids'][0], 'attention_mask': encoded['attention_mask'][0]}) + all_len.append(len(encoded['input_ids'][0]) - prefix_len - suffix_len) + + + total_batches = (len(all_tokens) + batch_size - 1) // batch_size + + all_len, all_tokens, all_ids = zip(*sorted(zip(all_len, all_tokens, all_ids), key=lambda x: x[0])) + + + for i in range(0, len(all_tokens), batch_size): + batch_num = i // batch_size + 1 + if batch_num % 50 == 0: + print(f"Processing batch {batch_num}/{total_batches} ({(batch_num/total_batches)*100:.1f}%)") + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + batch_tokens = all_tokens[i:i + batch_size] + + batch_max = max([len(ids['input_ids']) for ids in batch_tokens]) + encoded = dict() + encoded["attention_mask"] = [s['attention_mask'] + (batch_max - len(s['attention_mask'])) * [0] for s in 
batch_tokens] + encoded["input_ids"] = [s['input_ids'] + (batch_max - len(s['attention_mask'])) * [0] for s in batch_tokens] + encoded["attention_mask"] = torch.tensor(encoded["attention_mask"], dtype=torch.long).to(self.device) + encoded["input_ids"] = torch.tensor(encoded["input_ids"], dtype=torch.long).to(self.device) + + + + try: + + with torch.no_grad(): + model_output = self.model(**encoded) + + embeddings = model_output[0][:, 0].float() + + # L2 normalize embeddings (important for E5) + embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1) + + # Move to CPU immediately + all_embeddings.append(embeddings.cpu()) + + # Clear GPU memory + del encoded, model_output, embeddings + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + except torch.cuda.OutOfMemoryError as e: + print(f"CUDA OOM at batch {batch_num}, reducing batch size...") + # Process one item at a time + for j in range(len(encoded)): + try: + + encoded0 = encoded[j:j+1] + + with torch.no_grad(): + model_output = self.model(**encoded0) + + embeddings = model_output[0][:, 0].float() + embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1) + + all_embeddings.append(embeddings.cpu()) + + del encoded0, model_output, embeddings + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + except Exception as e2: + print(f"Failed to process single text: {e2}") + zero_embedding = torch.zeros(1, 1024).float() + all_embeddings.append(zero_embedding) + + id2idx = {} + for i in range(len(all_ids)): + if all_ids[i][0] not in id2idx: + id2idx[all_ids[i][0]] = [] + id2idx[all_ids[i][0]].append(i) + + return torch.cat(all_embeddings, dim=0).to(self.model.device), all_ids, id2idx # numpy() + + + +class BGEReranker: + def __init__(self, model_name=None, device=None): + """ + Initializes the BGE reranker for fine-grained relevance scoring. 
+ """ + # Use local model + if model_name is None: + # local_model_path = os.path.join('sub/models', 'bge-reranker-v2-m3') # 'bge-reranker-v2-m3' + local_model_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'models', 'bge-reranker-v2-m3_pseudo_tune_full') # 'bge-reranker-v2-m3' + if os.path.isdir(local_model_path): + model_name = local_model_path + print(f"Using local BGE model from: {model_name}") + + self.device = device or ("cuda" if torch.cuda.is_available() else "cpu") + + print(f"Loading BGE reranker on device: {self.device}") + + # BGE reranker is actually a special model type + from transformers import AutoModelForSequenceClassification + + self.tokenizer = AutoTokenizer.from_pretrained(model_name) + self.model = AutoModelForSequenceClassification.from_pretrained( + model_name, + torch_dtype=torch.bfloat16, + trust_remote_code=True + ).to(self.device) + self.model.eval() + + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + def prepare_corpus(self, texts, ids): + + self.corpus_tokens = {} + for i in range(len(texts)): # + encoded = self.tokenizer( + [texts[i].strip()], + padding=False, + truncation=False, + # max_length=512, + # return_tensors='pt' + ) + self.corpus_tokens[ids[i]] = {'input_ids': encoded['input_ids'][0], 'attention_mask': encoded['attention_mask'][0]} + + + def rerank(self, query_text, passages, passage_ids, max_time, top_k=20): + """ + Rerank the passages using BGE reranker - CORRECTED VERSION. 
+ """ + if not passages: + return [] + if time.time() > max_time-0.1: + print(f'No time for bge rerank!!') + return [] + + max_length = 2048 + + query_encoded = self.tokenizer( + [query_text.strip()], + padding=False, + truncation=True, + max_length = max_length - 1024, + ) + + all_tokens = [] + all_ids = [] + all_len = [] + + suffix_len = 1 + prefix_len = 1 + + max_passage = max_length - len(query_encoded['input_ids'][0]) - suffix_len - prefix_len + overlap = max_passage // 2 + + for i in range(len(passage_ids)): # tqdm( + encoded = self.corpus_tokens[passage_ids[i]] + + if len(encoded['input_ids']) - suffix_len - prefix_len > max_passage: + _idxs = list(range(prefix_len, len(encoded['input_ids'])-suffix_len)) + + i0 = 0 + while i0 < len(_idxs) - overlap: + i1 = min(i0+max_passage, len(_idxs)) + all_ids.append((passage_ids[i], (i0, i1, len(_idxs)))) + all_tokens.append({ + 'input_ids': query_encoded['input_ids'][0] + [2] + encoded['input_ids'][i0+prefix_len:i1+prefix_len] + [2], + 'attention_mask': query_encoded['attention_mask'][0] + [1] + encoded['attention_mask'][i0+prefix_len:i1+prefix_len] + [1] + }) + all_len.append(i1-i0) + i0 += max_passage - overlap + else: + all_ids.append((passage_ids[i], None)) + all_tokens.append({'input_ids': query_encoded['input_ids'][0] + [2] + encoded['input_ids'][1:], 'attention_mask': query_encoded['attention_mask'][0] + [1] + encoded['attention_mask'][1:]}) + all_len.append(len(encoded['input_ids']) - prefix_len - suffix_len) + + + # all_len, all_tokens, all_ids = zip(*sorted(zip(all_len, all_tokens, all_ids), key=lambda x: x[0])) + + + scores = [] + batch_size = 4 # 4 # Conservative batch size + + for i in range(0, len(all_tokens), batch_size): + if time.time() > max_time: + print(f'bge rerank time limit! 
processed {len(scores)} of {len(all_tokens)}') + break + + batch_tokens = all_tokens[i:i + batch_size] + + batch_max = max([len(ids['input_ids']) for ids in batch_tokens]) + encoded = dict() + encoded["attention_mask"] = [s['attention_mask'] + (batch_max - len(s['attention_mask'])) * [0] for s in batch_tokens] + encoded["input_ids"] = [s['input_ids'] + (batch_max - len(s['attention_mask'])) * [0] for s in batch_tokens] + encoded["attention_mask"] = torch.tensor(encoded["attention_mask"], dtype=torch.long).to(self.device) + encoded["input_ids"] = torch.tensor(encoded["input_ids"], dtype=torch.long).to(self.device) + + + try: + # BGE reranker expects SEPARATE query and passage inputs + # NOT concatenated strings + # batch_queries = [query_text] * len(batch_passages) + + # Tokenize query-passage pairs properly + with torch.no_grad(): + + # Get relevance scores from sequence classification model + outputs = self.model(**encoded) + + # BGE reranker outputs logits for relevance classification + logits = outputs.logits.float() + + # Handle different output shapes + if len(logits.shape) == 1: + # Single score per pair + batch_scores = logits.cpu().numpy() + elif logits.shape[1] == 1: + # Single column output + batch_scores = logits.squeeze(-1).cpu().numpy() + else: + # Binary classification - take positive class (index 1) + batch_scores = logits[:, 1].cpu().numpy() + + scores.extend(batch_scores.tolist()) + + # Cleanup + del encoded, outputs + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + except Exception as e: + print(f"Error in reranking batch {i//batch_size + 1}: {e}") + # Fallback: Use neutral scores for this batch + fallback_scores = [0.5] * len(batch_tokens) + scores.extend(fallback_scores) + + all_ids = all_ids[:len(scores)] + + # Combine results and sort by reranking score + results = list(zip(all_ids, scores)) + results.sort(key=lambda x: x[1], reverse=True) + + new_res = [] + _used = set([]) + for i in range(len(results)): + if results[i][0][0] not 
# Global instances, filled in by preprocess() and read by predict().
retrievers = None
reranker = None
retrieverBM5 = None
corpus_texts = {}  # Store original passage texts for reranking

def preprocess(corpus_dict):
    """Build the retrieval/reranking pipeline for a corpus.

    Loads six dense retrievers, the BGE reranker, and a BM25 index; embeds
    the whole corpus with every retriever (with long-document chunking).

    Args:
        corpus_dict: mapping of document id -> document dict; the passage
            text is read from the 'passage' key, falling back to 'text'.

    Returns:
        Dict with the initialized models and corpus data ('retrievers',
        'retrieverBM5', 'reranker', 'corpus_ids', 'corpus_texts',
        'num_documents').

    Note: also populates the module globals (retrievers, reranker,
    corpus_texts, retrieverBM5) that predict() uses directly.
    """
    global retrievers, reranker, corpus_texts, retrieverBM5

    start_time = time.time()

    print("=" * 60)
    print("PREPROCESSING: Initializing E5 + BGE Reranker Pipeline...")
    print("=" * 60)

    # GPU memory / matmul settings for the embedding passes.
    os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

    # Initialize the six dense retrievers of the ensemble.
    print("Loading retrievers...")

    retrieverE5 = E5Retriever()

    retrieverE5Ins = E5InstructRetriever()

    retrieverM3 = M3Retriever()

    retrieverSnowflake = SnowflakeInstructRetriever()

    retrieverSolon = SolonRetriever()

    # Hebrew-specific retriever reuses the E5 wrapper with a local model path.
    retrieverRAGbot = E5Retriever(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'models', 'Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0'))

    retrievers = [retrieverE5, retrieverE5Ins, retrieverM3, retrieverSnowflake, retrieverSolon, retrieverRAGbot]

    # Initialize BGE reranker
    print("Loading rerankers...")
    reranker = BGEReranker()

    print(f"Preparing corpus with {len(corpus_dict)} documents...")

    # Store corpus IDs and passage texts.
    corpus_ids = list(corpus_dict.keys())
    passages = [doc.get('passage', doc.get('text', '')) for doc in corpus_dict.values()]

    # Embed the corpus with every retriever; prepare_corpus replaces
    # ret.corpus_ids with per-window (doc_id, span) tuples and returns the
    # doc_id -> row-indices map as ret.id2idx.
    for ret in retrievers:
        ret.corpus_ids = list(corpus_dict.keys())
        print("Computing embeddings...")
        ret.corpus_embeddings, ret.corpus_ids, ret.id2idx = ret.prepare_corpus(passages, ret.corpus_ids, batch_size=32)

        print("✓ Corpus preprocessing complete!")
        # NOTE(review): after chunking, corpus_ids counts windows, not
        # documents — the message below may over-report.
        print(f"✓ Generated embeddings for {len(ret.corpus_ids)} documents")
        print(f"✓ Embedding matrix shape: {ret.corpus_embeddings.shape}")

    # Store original texts for reranking
    corpus_texts = {doc_id: passages[i] for i, doc_id in enumerate(corpus_ids)}

    # Pre-tokenize the corpus for the reranker.
    reranker.corpus_ids = list(corpus_dict.keys())
    reranker.prepare_corpus(passages, reranker.corpus_ids)

    # Lexical retriever for the hybrid first stage.
    retrieverBM5 = BM25sRetriever(passages, corpus_ids)

    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Elapsed time: {elapsed_time} seconds")

    return {
        'retrievers': retrievers,
        'retrieverBM5': retrieverBM5,
        'reranker': reranker,
        'corpus_ids': corpus_ids,
        'corpus_texts': corpus_texts,
        'num_documents': len(corpus_dict)
    }
+ """ + global retrievers, reranker, corpus_texts, retrieverBM5 + + start_time = time.time() + max_time = start_time + 1.85 # + + # Extract query text + query_text = query.get('query', '') + if not query_text: + return [] + + # Use global instances or get from preprocessed_data + if retrievers is None: + retrievers = preprocessed_data.get('retrievers') + reranker = preprocessed_data.get('reranker') + corpus_texts = preprocessed_data.get('corpus_texts', {}) + retrieverBM5 = preprocessed_data.get('retrieverBM5') + + if retrievers is None or reranker is None: + print("Error: Missing retriever or reranker in preprocessed data") + return [] + + TopNCandidates0 = 250 + TopNCandidates = 250 + + retriever_mean_std = [(-1.4131863134104607, 1.055066495990117), + (0.8814187049865723, 0.017973395064473152), + (0.5294753313064575, 0.06463246047496796), + (0.4171469509601593, 0.0699099600315094), + (0.49514809250831604, 0.06277099251747131), + (0.530211865901947, 0.0704670324921608)] + + + retriever_w = np.array([1.1, 0.25, 0.2, 0.3, 0.3, 0.3]) + retriever_w /= retriever_w.sum() + + try: + # STAGE 1: Retrieval (get top 100 candidates) + + candidate_ids = [] + candidate_scores0 = [] + candidate_passages = [] + candidate_scores_bm5 = [] + + ret_scores = [] + for retriever in retrievers: + query_embedding = retriever.embed_texts([query_text], is_query=True, batch_size=1) + query_embedding = torch.from_numpy(query_embedding).cuda() + # Compute cosine similarity with precomputed corpus embeddings + _scores = F.cosine_similarity(query_embedding, retriever.corpus_embeddings, 1, 1e-6) + _scores = _scores.cpu().numpy() + ret_scores.append(_scores) + _argsort = np.argsort(_scores)[::-1] + + for idx in _argsort[:TopNCandidates0]: + if retriever.corpus_ids[idx][0] not in candidate_ids: + candidate_ids.append(retriever.corpus_ids[idx][0]) + + _scores = retrieverBM5.get_scores(query_text) + _argsort = np.argsort(_scores)[::-1] + for idx in _argsort[:TopNCandidates0]: + if 
retrieverBM5.corpus_ids[idx] not in candidate_ids: + candidate_ids.append(retrieverBM5.corpus_ids[idx]) + + for i in range(len(candidate_ids)): + doc_id = candidate_ids[i] + _bm5_sc = (_scores[retrieverBM5.id2idx[doc_id]] - 5.919949) / 2.7885008 + candidate_scores_bm5.append(_bm5_sc) + _sc = 0 + for j in range(len(retrievers)): + _sc0 = np.max(ret_scores[j][retrievers[j].id2idx[doc_id]]) + _sc0 = (_sc0 - retriever_mean_std[j][0]) / retriever_mean_std[j][1] + _sc += retriever_w[j] * _sc0 + candidate_scores0.append(0.85 * _sc + 0.15 * _bm5_sc) + candidate_passages.append(corpus_texts.get(doc_id, '')) + + candidate_scores, candidate_ids, candidate_passages, candidate_scores_bm5 = zip(*sorted(zip(candidate_scores0, candidate_ids, candidate_passages, candidate_scores_bm5))[::-1][:min(len(candidate_ids), TopNCandidates)]) + + + # STAGE 2: Reranking (rerank top 100 -> top 20) + + reranked_results = reranker.rerank( + query_text, + candidate_passages, + candidate_ids, + max_time, + top_k=len(candidate_ids) + ) + + if len(reranked_results) >= 20: + reranked_scores0 = [rerank_score for (passage_id, rerank_score) in reranked_results] + reranked_results = [passage_id for (passage_id, rerank_score) in reranked_results] + scores = [] + for _i, _p in enumerate(reranked_results): + scores.append(0.35 * (reranked_scores0[_i] + 1.8669906079283858) / 1.207719509974457 + + 0.65 * candidate_scores[candidate_ids.index(_p)]) + + reranked_results = [(rerank_score, passage_id) for rerank_score, passage_id in sorted(zip(scores, reranked_results))][::-1] #[:TopNCandidates2] + else: + reranked_results = [(candidate_scores[i], candidate_ids[i]) for i in range(20)] + + + # Build final results with ACTUAL reranking scores + results = [] + for rank, (rerank_score, passage_id) in enumerate(reranked_results[:20]): + results.append({ + 'paragraph_uuid': passage_id, + 'score': float(rerank_score) # Use actual BGE reranker score! 
+ }) + + end_time = time.time() + elapsed_time = end_time - start_time + # print(f"✓ Returned {len(results)} results with reranker scores. Elapsed time: {elapsed_time} seconds") + + return results + + except Exception as e: + print(f"Error in prediction: {e}") + # Fallback to E5-only retrieval with E5 scores + try: + query_embedding = retrievers[0].embed_texts([query_text], is_query=True, batch_size=1) + query_embedding = torch.from_numpy(query_embedding).cuda() + e5_scores = F.cosine_similarity(query_embedding, retrievers[0].corpus_embeddings, 1, 1e-6) + e5_scores = e5_scores.cpu().numpy() + top_indices = np.argsort(e5_scores)[::-1][:20] + + results = [] + for idx in top_indices: + results.append({ + 'paragraph_uuid': retriever.corpus_ids[idx], + 'score': float(e5_scores[idx]) # Use actual E5 cosine similarity score + }) + + return results + except: + return [] diff --git a/victord/sub19/models/Solon-embeddings-large-0.1/1_Pooling/config.json b/victord/sub19/models/Solon-embeddings-large-0.1/1_Pooling/config.json new file mode 100644 index 0000000000000000000000000000000000000000..3f060ee536308b48017dad1a834f306f115695a3 --- /dev/null +++ b/victord/sub19/models/Solon-embeddings-large-0.1/1_Pooling/config.json @@ -0,0 +1,10 @@ +{ + "word_embedding_dimension": 1024, + "pooling_mode_cls_token": false, + "pooling_mode_mean_tokens": true, + "pooling_mode_max_tokens": false, + "pooling_mode_mean_sqrt_len_tokens": false, + "pooling_mode_weightedmean_tokens": false, + "pooling_mode_lasttoken": false, + "include_prompt": true +} \ No newline at end of file diff --git a/victord/sub19/models/Solon-embeddings-large-0.1/README.md b/victord/sub19/models/Solon-embeddings-large-0.1/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d087525fab8bd7fed42562039945ff1fc3b24b1c --- /dev/null +++ b/victord/sub19/models/Solon-embeddings-large-0.1/README.md @@ -0,0 +1,810 @@ +--- +tags: +- mteb +model-index: +- name: Solon-embeddings-large-0.1 + results: + - 
task: + type: sentence-similarity + name: Passage Retrieval + dataset: + type: unicamp-dl/mmarco + name: mMARCO-fr + config: french + split: validation + metrics: + - type: recall_at_500 + name: Recall@500 + value: 92.7 + - type: recall_at_100 + name: Recall@100 + value: 82.7 + - type: recall_at_10 + name: Recall@10 + value: 55.5 + - type: map_at_10 + name: MAP@10 + value: 29.4 + - type: ndcg_at_10 + name: nDCG@10 + value: 35.8 + - type: mrr_at_10 + name: MRR@10 + value: 29.9 + - task: + type: Clustering + dataset: + type: lyon-nlp/alloprof + name: MTEB AlloProfClusteringP2P + config: default + split: test + revision: 392ba3f5bcc8c51f578786c1fc3dae648662cb9b + metrics: + - type: v_measure + value: 64.16942168287153 + - task: + type: Clustering + dataset: + type: lyon-nlp/alloprof + name: MTEB AlloProfClusteringS2S + config: default + split: test + revision: 392ba3f5bcc8c51f578786c1fc3dae648662cb9b + metrics: + - type: v_measure + value: 38.17076313383054 + - task: + type: Reranking + dataset: + type: lyon-nlp/mteb-fr-reranking-alloprof-s2p + name: MTEB AlloprofReranking + config: default + split: test + revision: 666fdacebe0291776e86f29345663dfaf80a0db9 + metrics: + - type: map + value: 64.8770878097632 + - type: mrr + value: 66.39132423169396 + - task: + type: Retrieval + dataset: + type: lyon-nlp/alloprof + name: MTEB AlloprofRetrieval + config: default + split: test + revision: 392ba3f5bcc8c51f578786c1fc3dae648662cb9b + metrics: + - type: map_at_1 + value: 29.62 + - type: map_at_10 + value: 40.963 + - type: map_at_100 + value: 41.894 + - type: map_at_1000 + value: 41.939 + - type: map_at_3 + value: 37.708999999999996 + - type: map_at_5 + value: 39.696999999999996 + - type: mrr_at_1 + value: 29.62 + - type: mrr_at_10 + value: 40.963 + - type: mrr_at_100 + value: 41.894 + - type: mrr_at_1000 + value: 41.939 + - type: mrr_at_3 + value: 37.708999999999996 + - type: mrr_at_5 + value: 39.696999999999996 + - type: ndcg_at_1 + value: 29.62 + - type: ndcg_at_10 + value: 
46.942 + - type: ndcg_at_100 + value: 51.629999999999995 + - type: ndcg_at_1000 + value: 52.927 + - type: ndcg_at_3 + value: 40.333999999999996 + - type: ndcg_at_5 + value: 43.922 + - type: precision_at_1 + value: 29.62 + - type: precision_at_10 + value: 6.589 + - type: precision_at_100 + value: 0.882 + - type: precision_at_1000 + value: 0.099 + - type: precision_at_3 + value: 15.976 + - type: precision_at_5 + value: 11.33 + - type: recall_at_1 + value: 29.62 + - type: recall_at_10 + value: 65.889 + - type: recall_at_100 + value: 88.212 + - type: recall_at_1000 + value: 98.575 + - type: recall_at_3 + value: 47.927 + - type: recall_at_5 + value: 56.64900000000001 + - task: + type: Classification + dataset: + type: mteb/amazon_reviews_multi + name: MTEB AmazonReviewsClassification (fr) + config: fr + split: test + revision: 1399c76144fd37290681b995c656ef9b2e06e26d + metrics: + - type: accuracy + value: 42.077999999999996 + - type: f1 + value: 40.64511241732637 + - task: + type: Retrieval + dataset: + type: maastrichtlawtech/bsard + name: MTEB BSARDRetrieval + config: default + split: test + revision: 5effa1b9b5fa3b0f9e12523e6e43e5f86a6e6d59 + metrics: + - type: map_at_1 + value: 0.901 + - type: map_at_10 + value: 1.524 + - type: map_at_100 + value: 1.833 + - type: map_at_1000 + value: 1.916 + - type: map_at_3 + value: 1.276 + - type: map_at_5 + value: 1.276 + - type: mrr_at_1 + value: 0.901 + - type: mrr_at_10 + value: 1.524 + - type: mrr_at_100 + value: 1.833 + - type: mrr_at_1000 + value: 1.916 + - type: mrr_at_3 + value: 1.276 + - type: mrr_at_5 + value: 1.276 + - type: ndcg_at_1 + value: 0.901 + - type: ndcg_at_10 + value: 2.085 + - type: ndcg_at_100 + value: 3.805 + - type: ndcg_at_1000 + value: 6.704000000000001 + - type: ndcg_at_3 + value: 1.41 + - type: ndcg_at_5 + value: 1.41 + - type: precision_at_1 + value: 0.901 + - type: precision_at_10 + value: 0.40499999999999997 + - type: precision_at_100 + value: 0.126 + - type: precision_at_1000 + value: 0.037 + - 
type: precision_at_3 + value: 0.601 + - type: precision_at_5 + value: 0.36 + - type: recall_at_1 + value: 0.901 + - type: recall_at_10 + value: 4.054 + - type: recall_at_100 + value: 12.613 + - type: recall_at_1000 + value: 36.937 + - type: recall_at_3 + value: 1.802 + - type: recall_at_5 + value: 1.802 + - task: + type: BitextMining + dataset: + type: rbawden/DiaBLa + name: MTEB DiaBLaBitextMining (fr-en) + config: fr-en + split: test + revision: 5345895c56a601afe1a98519ce3199be60a27dba + metrics: + - type: accuracy + value: 88.90048712595686 + - type: f1 + value: 86.94952864886115 + - type: precision + value: 86.20344379175826 + - type: recall + value: 88.90048712595686 + - task: + type: Clustering + dataset: + type: lyon-nlp/clustering-hal-s2s + name: MTEB HALClusteringS2S + config: default + split: test + revision: e06ebbbb123f8144bef1a5d18796f3dec9ae2915 + metrics: + - type: v_measure + value: 24.087988843991155 + - task: + type: Clustering + dataset: + type: mlsum + name: MTEB MLSUMClusteringP2P + config: default + split: test + revision: b5d54f8f3b61ae17845046286940f03c6bc79bc7 + metrics: + - type: v_measure + value: 43.79603865728535 + - task: + type: Clustering + dataset: + type: mlsum + name: MTEB MLSUMClusteringS2S + config: default + split: test + revision: b5d54f8f3b61ae17845046286940f03c6bc79bc7 + metrics: + - type: v_measure + value: 37.746550373003 + - task: + type: Classification + dataset: + type: mteb/mtop_domain + name: MTEB MTOPDomainClassification (fr) + config: fr + split: test + revision: d80d48c1eb48d3562165c59d59d0034df9fff0bf + metrics: + - type: accuracy + value: 89.26088318196052 + - type: f1 + value: 88.95811185929033 + - task: + type: Classification + dataset: + type: mteb/mtop_intent + name: MTEB MTOPIntentClassification (fr) + config: fr + split: test + revision: ae001d0e6b1228650b7bd1c2c65fb50ad11a8aba + metrics: + - type: accuracy + value: 68.55308487316003 + - type: f1 + value: 48.2936682439785 + - task: + type: Classification + 
dataset: + type: masakhane/masakhanews + name: MTEB MasakhaNEWSClassification (fra) + config: fra + split: test + revision: 8ccc72e69e65f40c70e117d8b3c08306bb788b60 + metrics: + - type: accuracy + value: 81.51658767772511 + - type: f1 + value: 77.695234448912 + - task: + type: Clustering + dataset: + type: masakhane/masakhanews + name: MTEB MasakhaNEWSClusteringP2P (fra) + config: fra + split: test + revision: 8ccc72e69e65f40c70e117d8b3c08306bb788b60 + metrics: + - type: v_measure + value: 40.80377094681114 + - task: + type: Clustering + dataset: + type: masakhane/masakhanews + name: MTEB MasakhaNEWSClusteringS2S (fra) + config: fra + split: test + revision: 8ccc72e69e65f40c70e117d8b3c08306bb788b60 + metrics: + - type: v_measure + value: 28.79703837416241 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (fr) + config: fr + split: test + revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 67.40080699394755 + - type: f1 + value: 65.60793135686376 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (fr) + config: fr + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + metrics: + - type: accuracy + value: 71.29455279085406 + - type: f1 + value: 70.80876673828983 + - task: + type: Retrieval + dataset: + type: jinaai/mintakaqa + name: MTEB MintakaRetrieval (fr) + config: fr + split: test + revision: efa78cc2f74bbcd21eff2261f9e13aebe40b814e + metrics: + - type: map_at_1 + value: 16.625999999999998 + - type: map_at_10 + value: 25.224999999999998 + - type: map_at_100 + value: 26.291999999999998 + - type: map_at_1000 + value: 26.395000000000003 + - type: map_at_3 + value: 22.378999999999998 + - type: map_at_5 + value: 24.009 + - type: mrr_at_1 + value: 16.625999999999998 + - type: mrr_at_10 + value: 25.224999999999998 + - type: mrr_at_100 + value: 26.291999999999998 + - type: 
mrr_at_1000 + value: 26.395000000000003 + - type: mrr_at_3 + value: 22.378999999999998 + - type: mrr_at_5 + value: 24.009 + - type: ndcg_at_1 + value: 16.625999999999998 + - type: ndcg_at_10 + value: 30.074 + - type: ndcg_at_100 + value: 35.683 + - type: ndcg_at_1000 + value: 38.714999999999996 + - type: ndcg_at_3 + value: 24.188000000000002 + - type: ndcg_at_5 + value: 27.124 + - type: precision_at_1 + value: 16.625999999999998 + - type: precision_at_10 + value: 4.566 + - type: precision_at_100 + value: 0.729 + - type: precision_at_1000 + value: 0.097 + - type: precision_at_3 + value: 9.801 + - type: precision_at_5 + value: 7.305000000000001 + - type: recall_at_1 + value: 16.625999999999998 + - type: recall_at_10 + value: 45.659 + - type: recall_at_100 + value: 72.85000000000001 + - type: recall_at_1000 + value: 97.42 + - type: recall_at_3 + value: 29.402 + - type: recall_at_5 + value: 36.527 + - task: + type: PairClassification + dataset: + type: GEM/opusparcus + name: MTEB OpusparcusPC (fr) + config: fr + split: test + revision: 9e9b1f8ef51616073f47f306f7f47dd91663f86a + metrics: + - type: cos_sim_accuracy + value: 83.58310626702998 + - type: cos_sim_ap + value: 94.01979957812989 + - type: cos_sim_f1 + value: 88.70135958743555 + - type: cos_sim_precision + value: 84.01420959147424 + - type: cos_sim_recall + value: 93.94240317775571 + - type: dot_accuracy + value: 83.58310626702998 + - type: dot_ap + value: 94.01979957812989 + - type: dot_f1 + value: 88.70135958743555 + - type: dot_precision + value: 84.01420959147424 + - type: dot_recall + value: 93.94240317775571 + - type: euclidean_accuracy + value: 83.58310626702998 + - type: euclidean_ap + value: 94.01979957812989 + - type: euclidean_f1 + value: 88.70135958743555 + - type: euclidean_precision + value: 84.01420959147424 + - type: euclidean_recall + value: 93.94240317775571 + - type: manhattan_accuracy + value: 83.58310626702998 + - type: manhattan_ap + value: 93.99936024003892 + - type: manhattan_f1 + value: 
88.6924150767799 + - type: manhattan_precision + value: 83.45008756567425 + - type: manhattan_recall + value: 94.63753723932473 + - type: max_accuracy + value: 83.58310626702998 + - type: max_ap + value: 94.01979957812989 + - type: max_f1 + value: 88.70135958743555 + - task: + type: PairClassification + dataset: + type: paws-x + name: MTEB PawsX (fr) + config: fr + split: test + revision: 8a04d940a42cd40658986fdd8e3da561533a3646 + metrics: + - type: cos_sim_accuracy + value: 60.6 + - type: cos_sim_ap + value: 60.18915797975459 + - type: cos_sim_f1 + value: 62.491349480968864 + - type: cos_sim_precision + value: 45.44539506794162 + - type: cos_sim_recall + value: 100 + - type: dot_accuracy + value: 60.6 + - type: dot_ap + value: 60.091135216056024 + - type: dot_f1 + value: 62.491349480968864 + - type: dot_precision + value: 45.44539506794162 + - type: dot_recall + value: 100 + - type: euclidean_accuracy + value: 60.6 + - type: euclidean_ap + value: 60.18915797975459 + - type: euclidean_f1 + value: 62.491349480968864 + - type: euclidean_precision + value: 45.44539506794162 + - type: euclidean_recall + value: 100 + - type: manhattan_accuracy + value: 60.650000000000006 + - type: manhattan_ap + value: 60.2082343915352 + - type: manhattan_f1 + value: 62.491349480968864 + - type: manhattan_precision + value: 45.44539506794162 + - type: manhattan_recall + value: 100 + - type: max_accuracy + value: 60.650000000000006 + - type: max_ap + value: 60.2082343915352 + - type: max_f1 + value: 62.491349480968864 + - task: + type: STS + dataset: + type: Lajavaness/SICK-fr + name: MTEB SICKFr + config: default + split: test + revision: e077ab4cf4774a1e36d86d593b150422fafd8e8a + metrics: + - type: cos_sim_pearson + value: 79.77067200230256 + - type: cos_sim_spearman + value: 76.7445532523278 + - type: euclidean_pearson + value: 76.34017074673956 + - type: euclidean_spearman + value: 76.7453011027832 + - type: manhattan_pearson + value: 76.19578084197778 + - type: manhattan_spearman + 
value: 76.56293456459228 + - task: + type: STS + dataset: + type: mteb/sts22-crosslingual-sts + name: MTEB STS22 (fr) + config: fr + split: test + revision: eea2b4fe26a775864c896887d910b76a8098ad3f + metrics: + - type: cos_sim_pearson + value: 81.2564160237984 + - type: cos_sim_spearman + value: 83.30552085410882 + - type: euclidean_pearson + value: 82.00494560507786 + - type: euclidean_spearman + value: 83.30552085410882 + - type: manhattan_pearson + value: 81.93132229157803 + - type: manhattan_spearman + value: 83.04357992939353 + - task: + type: STS + dataset: + type: stsb_multi_mt + name: MTEB STSBenchmarkMultilingualSTS (fr) + config: fr + split: test + revision: 93d57ef91790589e3ce9c365164337a8a78b7632 + metrics: + - type: cos_sim_pearson + value: 80.34931905288978 + - type: cos_sim_spearman + value: 79.99372771100049 + - type: euclidean_pearson + value: 78.37976845123443 + - type: euclidean_spearman + value: 79.99452356550658 + - type: manhattan_pearson + value: 78.24434042082316 + - type: manhattan_spearman + value: 79.87248340061164 + - task: + type: Summarization + dataset: + type: lyon-nlp/summarization-summeval-fr-p2p + name: MTEB SummEvalFr + config: default + split: test + revision: b385812de6a9577b6f4d0f88c6a6e35395a94054 + metrics: + - type: cos_sim_pearson + value: 30.476001473421586 + - type: cos_sim_spearman + value: 29.687350195905456 + - type: dot_pearson + value: 30.476000875190685 + - type: dot_spearman + value: 29.662224660056562 + - task: + type: Reranking + dataset: + type: lyon-nlp/mteb-fr-reranking-syntec-s2p + name: MTEB SyntecReranking + config: default + split: test + revision: b205c5084a0934ce8af14338bf03feb19499c84d + metrics: + - type: map + value: 88.28333333333333 + - type: mrr + value: 88.28333333333333 + - task: + type: Retrieval + dataset: + type: lyon-nlp/mteb-fr-retrieval-syntec-s2p + name: MTEB SyntecRetrieval + config: default + split: test + revision: 77f7e271bf4a92b24fce5119f3486b583ca016ff + metrics: + - type: map_at_1 
+ value: 69 + - type: map_at_10 + value: 79.906 + - type: map_at_100 + value: 79.982 + - type: map_at_1000 + value: 79.982 + - type: map_at_3 + value: 77.667 + - type: map_at_5 + value: 79.51700000000001 + - type: mrr_at_1 + value: 69 + - type: mrr_at_10 + value: 79.906 + - type: mrr_at_100 + value: 79.982 + - type: mrr_at_1000 + value: 79.982 + - type: mrr_at_3 + value: 77.667 + - type: mrr_at_5 + value: 79.51700000000001 + - type: ndcg_at_1 + value: 69 + - type: ndcg_at_10 + value: 84.60499999999999 + - type: ndcg_at_100 + value: 84.868 + - type: ndcg_at_1000 + value: 84.868 + - type: ndcg_at_3 + value: 80.333 + - type: ndcg_at_5 + value: 83.647 + - type: precision_at_1 + value: 69 + - type: precision_at_10 + value: 9.9 + - type: precision_at_100 + value: 1 + - type: precision_at_1000 + value: 0.1 + - type: precision_at_3 + value: 29.333 + - type: precision_at_5 + value: 19.2 + - type: recall_at_1 + value: 69 + - type: recall_at_10 + value: 99 + - type: recall_at_100 + value: 100 + - type: recall_at_1000 + value: 100 + - type: recall_at_3 + value: 88 + - type: recall_at_5 + value: 96 + - task: + type: Retrieval + dataset: + type: jinaai/xpqa + name: MTEB XPQARetrieval (fr) + config: fr + split: test + revision: c99d599f0a6ab9b85b065da6f9d94f9cf731679f + metrics: + - type: map_at_1 + value: 42.027 + - type: map_at_10 + value: 64.331 + - type: map_at_100 + value: 65.657 + - type: map_at_1000 + value: 65.7 + - type: map_at_3 + value: 57.967999999999996 + - type: map_at_5 + value: 62.33800000000001 + - type: mrr_at_1 + value: 65.688 + - type: mrr_at_10 + value: 72.263 + - type: mrr_at_100 + value: 72.679 + - type: mrr_at_1000 + value: 72.69099999999999 + - type: mrr_at_3 + value: 70.405 + - type: mrr_at_5 + value: 71.587 + - type: ndcg_at_1 + value: 65.688 + - type: ndcg_at_10 + value: 70.221 + - type: ndcg_at_100 + value: 74.457 + - type: ndcg_at_1000 + value: 75.178 + - type: ndcg_at_3 + value: 65.423 + - type: ndcg_at_5 + value: 67.05499999999999 + - type: 
precision_at_1 + value: 65.688 + - type: precision_at_10 + value: 16.208 + - type: precision_at_100 + value: 1.975 + - type: precision_at_1000 + value: 0.207 + - type: precision_at_3 + value: 39.831 + - type: precision_at_5 + value: 28.652 + - type: recall_at_1 + value: 42.027 + - type: recall_at_10 + value: 78.803 + - type: recall_at_100 + value: 95.051 + - type: recall_at_1000 + value: 99.75500000000001 + - type: recall_at_3 + value: 62.62799999999999 + - type: recall_at_5 + value: 70.975 +license: mit +language: +- fr +--- + +# Solon Embeddings — large 0.1 + +SOTA Open source french embedding model. + +**Instructions :** +Add "query : " before the *query* to retrieve to increase performance of retrieval. +No instructions needed for *passages*. + + +| Model | Mean Score | +| --- | --- | +| **OrdalieTech/Solon-embeddings-large-0.1** | 0.7490 | +| cohere/embed-multilingual-v3 | 0.7402 | +| **OrdalieTech/Solon-embeddings-base-0.1** | 0.7306 | +| openai/ada-002 | 0.7290 | +| cohere/embed-multilingual-light-v3 | 0.6945 | +| antoinelouis/biencoder-camembert-base-mmarcoFR | 0.6826 | +| dangvantuan/sentence-camembert-large | 0.6756 | +| voyage/voyage-01 | 0.6753 | +| intfloat/multilingual-e5-large | 0.6660 | +| intfloat/multilingual-e5-base | 0.6597 | +| Sbert/paraphrase-multilingual-mpnet-base-v2 | 0.5975 | +| dangvantuan/sentence-camembert-base | 0.5456 | +| EuropeanParliament/eubert_embedding_v1 | 0.5063 | + +These results have been obtained through 9 french benchmarks on a variety of text similarity tasks (classification, reranking, STS) : +- AmazonReviewsClassification (MTEB) +- MassiveIntentClassification (MTEB) +- MassiveScenarioClassification (MTEB) +- MTOPDomainClassification (MTEB) +- MTOPIntentClassification (MTEB) +- STS22 (MTEB) +- MiraclFRRerank (Miracl) +- OrdalieFRSTS (Ordalie) +- OrdalieFRReranking (Ordalie) + +We created OrdalieFRSTS and OrdalieFRReranking to enhance the benchmarking capabilities of French STS and reranking assessments. 
+ +(evaluation script available here : github.com/OrdalieTech/mteb) \ No newline at end of file diff --git a/victord/sub19/models/Solon-embeddings-large-0.1/config.json b/victord/sub19/models/Solon-embeddings-large-0.1/config.json new file mode 100644 index 0000000000000000000000000000000000000000..97a8add06d50588ae7f902808c2101e4fe5adedd --- /dev/null +++ b/victord/sub19/models/Solon-embeddings-large-0.1/config.json @@ -0,0 +1,27 @@ +{ + "architectures": [ + "XLMRobertaModel" + ], + "attention_probs_dropout_prob": 0.1, + "bos_token_id": 0, + "classifier_dropout": null, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 4096, + "layer_norm_eps": 1e-05, + "max_position_embeddings": 514, + "model_type": "xlm-roberta", + "num_attention_heads": 16, + "num_hidden_layers": 24, + "output_past": true, + "pad_token_id": 1, + "position_embedding_type": "absolute", + "torch_dtype": "bfloat16", + "transformers_version": "4.53.2", + "type_vocab_size": 1, + "use_cache": true, + "vocab_size": 250002 +} diff --git a/victord/sub19/models/Solon-embeddings-large-0.1/config_sentence_transformers.json b/victord/sub19/models/Solon-embeddings-large-0.1/config_sentence_transformers.json new file mode 100644 index 0000000000000000000000000000000000000000..aaa499896f85c8e2ba321476fd1479e476145577 --- /dev/null +++ b/victord/sub19/models/Solon-embeddings-large-0.1/config_sentence_transformers.json @@ -0,0 +1,14 @@ +{ + "__version__": { + "sentence_transformers": "5.1.1", + "transformers": "4.53.2", + "pytorch": "2.9.0+cu128" + }, + "model_type": "SentenceTransformer", + "prompts": { + "query": "", + "document": "" + }, + "default_prompt_name": null, + "similarity_fn_name": "cosine" +} \ No newline at end of file diff --git a/victord/sub19/models/Solon-embeddings-large-0.1/model.safetensors b/victord/sub19/models/Solon-embeddings-large-0.1/model.safetensors new file mode 100644 index 
0000000000000000000000000000000000000000..b885133c49312d917bb1aada5ed4b1bd7842fe39 --- /dev/null +++ b/victord/sub19/models/Solon-embeddings-large-0.1/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93f3e71f01d33409b21e3c4ff37d249cb3c27c883b24a90afd352fb95cbd67fe +size 1119826072 diff --git a/victord/sub19/models/Solon-embeddings-large-0.1/modules.json b/victord/sub19/models/Solon-embeddings-large-0.1/modules.json new file mode 100644 index 0000000000000000000000000000000000000000..952a9b81c0bfd99800fabf352f69c7ccd46c5e43 --- /dev/null +++ b/victord/sub19/models/Solon-embeddings-large-0.1/modules.json @@ -0,0 +1,20 @@ +[ + { + "idx": 0, + "name": "0", + "path": "", + "type": "sentence_transformers.models.Transformer" + }, + { + "idx": 1, + "name": "1", + "path": "1_Pooling", + "type": "sentence_transformers.models.Pooling" + }, + { + "idx": 2, + "name": "2", + "path": "2_Normalize", + "type": "sentence_transformers.models.Normalize" + } +] \ No newline at end of file diff --git a/victord/sub19/models/Solon-embeddings-large-0.1/sentence_bert_config.json b/victord/sub19/models/Solon-embeddings-large-0.1/sentence_bert_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4eca68d85ecd3034cf4174d8a4033a75344ea62d --- /dev/null +++ b/victord/sub19/models/Solon-embeddings-large-0.1/sentence_bert_config.json @@ -0,0 +1,4 @@ +{ + "max_seq_length": 512, + "do_lower_case": false +} \ No newline at end of file diff --git a/victord/sub19/models/Solon-embeddings-large-0.1/sentencepiece.bpe.model b/victord/sub19/models/Solon-embeddings-large-0.1/sentencepiece.bpe.model new file mode 100644 index 0000000000000000000000000000000000000000..7a3f40a75f870bc1f21700cd414dc2acc431583c --- /dev/null +++ b/victord/sub19/models/Solon-embeddings-large-0.1/sentencepiece.bpe.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865 +size 
5069051 diff --git a/victord/sub19/models/Solon-embeddings-large-0.1/special_tokens_map.json b/victord/sub19/models/Solon-embeddings-large-0.1/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b1879d702821e753ffe4245048eee415d54a9385 --- /dev/null +++ b/victord/sub19/models/Solon-embeddings-large-0.1/special_tokens_map.json @@ -0,0 +1,51 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "cls_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "mask_token": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "sep_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/victord/sub19/models/Solon-embeddings-large-0.1/tokenizer.json b/victord/sub19/models/Solon-embeddings-large-0.1/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..a8c21d8691f041c9859f90256299e3f31e556d5b --- /dev/null +++ b/victord/sub19/models/Solon-embeddings-large-0.1/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:249df0778f236f6ece390de0de746838ef25b9d6954b68c2ee71249e0a9d8fd4 +size 17082799 diff --git a/victord/sub19/models/Solon-embeddings-large-0.1/tokenizer_config.json b/victord/sub19/models/Solon-embeddings-large-0.1/tokenizer_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..f748729626d19d32e1089159feee8a22ccbf0959 --- /dev/null +++ b/victord/sub19/models/Solon-embeddings-large-0.1/tokenizer_config.json @@ -0,0 +1,56 @@ +{ + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250001": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": true, + "cls_token": "", + "eos_token": "", + "extra_special_tokens": {}, + "mask_token": "", + "model_max_length": 512, + "pad_token": "", + "sep_token": "", + "sp_model_kwargs": {}, + "tokenizer_class": "XLMRobertaTokenizer", + "unk_token": "" +} diff --git a/victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/1_Pooling/config.json b/victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/1_Pooling/config.json new file mode 100644 index 0000000000000000000000000000000000000000..3f060ee536308b48017dad1a834f306f115695a3 --- /dev/null +++ b/victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/1_Pooling/config.json @@ -0,0 +1,10 @@ +{ + "word_embedding_dimension": 1024, + "pooling_mode_cls_token": false, + "pooling_mode_mean_tokens": true, + "pooling_mode_max_tokens": false, + "pooling_mode_mean_sqrt_len_tokens": false, + "pooling_mode_weightedmean_tokens": false, + "pooling_mode_lasttoken": false, + "include_prompt": true +} \ No newline at end of file diff --git 
a/victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/README.md b/victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f29422453f386e999cda02555899de5afb458178 --- /dev/null +++ b/victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/README.md @@ -0,0 +1,93 @@ +--- +library_name: sentence-transformers +pipeline_tag: sentence-similarity +tags: +- sentence-transformers +- feature-extraction +- sentence-similarity + +--- + +# {MODEL_NAME} + +This is a [sentence-transformers](https://www.SBERT.net) model: It maps sentences & paragraphs to a 1024 dimensional dense vector space and can be used for tasks like clustering or semantic search. + + + +## Usage (Sentence-Transformers) + +Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed: + +``` +pip install -U sentence-transformers +``` + +Then you can use the model like this: + +```python +from sentence_transformers import SentenceTransformer +sentences = ["This is an example sentence", "Each sentence is converted"] + +model = SentenceTransformer('{MODEL_NAME}') +embeddings = model.encode(sentences) +print(embeddings) +``` + + + +## Evaluation Results + + + +For an automated evaluation of this model, see the *Sentence Embeddings Benchmark*: [https://seb.sbert.net](https://seb.sbert.net?model_name={MODEL_NAME}) + + +## Training +The model was trained with the parameters: + +**DataLoader**: + +`torch.utils.data.dataloader.DataLoader` of length 874 with parameters: +``` +{'batch_size': None, 'sampler': 'torch.utils.data.sampler.SequentialSampler', 'batch_sampler': 'utils.UniqueQuestionSampler'} +``` + +**Loss**: + +`sentence_transformers.losses.MultipleNegativesRankingLoss.MultipleNegativesRankingLoss` with parameters: + ``` + {'scale': 100, 'similarity_fct': 'cos_sim'} + ``` + +Parameters of the fit()-Method: +``` +{ + "epochs": 10, + "evaluation_steps": 0, 
+ "evaluator": "utils.CustomInformationRetrievalEvaluator", + "max_grad_norm": 1, + "optimizer_class": "", + "optimizer_params": { + "lr": 1e-05, + "weight_decay": 0.01 + }, + "scheduler": "WarmupLinear", + "steps_per_epoch": null, + "warmup_steps": 243, + "weight_decay": 0.01 +} +``` + + +## Full Model Architecture +``` +SentenceTransformer( + (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: XLMRobertaModel + (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True}) + (2): Normalize() +) +``` + +## Citing & Authors + + \ No newline at end of file diff --git a/victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/config.json b/victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/config.json new file mode 100644 index 0000000000000000000000000000000000000000..97a8add06d50588ae7f902808c2101e4fe5adedd --- /dev/null +++ b/victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/config.json @@ -0,0 +1,27 @@ +{ + "architectures": [ + "XLMRobertaModel" + ], + "attention_probs_dropout_prob": 0.1, + "bos_token_id": 0, + "classifier_dropout": null, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 4096, + "layer_norm_eps": 1e-05, + "max_position_embeddings": 514, + "model_type": "xlm-roberta", + "num_attention_heads": 16, + "num_hidden_layers": 24, + "output_past": true, + "pad_token_id": 1, + "position_embedding_type": "absolute", + "torch_dtype": "bfloat16", + "transformers_version": "4.53.2", + "type_vocab_size": 1, + "use_cache": true, + "vocab_size": 250002 +} diff --git 
a/victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/config_sentence_transformers.json b/victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/config_sentence_transformers.json new file mode 100644 index 0000000000000000000000000000000000000000..b276271623ce8abb79e36b9dfe4b822d88f3bc16 --- /dev/null +++ b/victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/config_sentence_transformers.json @@ -0,0 +1,14 @@ +{ + "__version__": { + "sentence_transformers": "5.1.2", + "transformers": "4.53.2", + "pytorch": "2.9.0+cu128" + }, + "prompts": { + "query": "", + "document": "" + }, + "default_prompt_name": null, + "model_type": "SentenceTransformer", + "similarity_fn_name": "cosine" +} \ No newline at end of file diff --git a/victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/model.safetensors b/victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6c5a83aeb6bfd882a2c9d98213bcacb8db0a442a --- /dev/null +++ b/victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c5ecf0a1c5a72fb78c91dfd5b0c7f16145ca5fe2e5c4d3e0870b73fef7f6e99 +size 1119826072 diff --git a/victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/modules.json b/victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/modules.json new file mode 100644 index 0000000000000000000000000000000000000000..952a9b81c0bfd99800fabf352f69c7ccd46c5e43 --- /dev/null +++ b/victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/modules.json @@ -0,0 +1,20 @@ +[ + { + "idx": 0, + "name": "0", + "path": "", + "type": "sentence_transformers.models.Transformer" + }, + { + "idx": 1, + "name": "1", + "path": "1_Pooling", + "type": "sentence_transformers.models.Pooling" + }, + { + "idx": 2, + "name": "2", + "path": "2_Normalize", + 
"type": "sentence_transformers.models.Normalize" + } +] \ No newline at end of file diff --git a/victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/sentence_bert_config.json b/victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/sentence_bert_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4eca68d85ecd3034cf4174d8a4033a75344ea62d --- /dev/null +++ b/victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/sentence_bert_config.json @@ -0,0 +1,4 @@ +{ + "max_seq_length": 512, + "do_lower_case": false +} \ No newline at end of file diff --git a/victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/special_tokens_map.json b/victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b1879d702821e753ffe4245048eee415d54a9385 --- /dev/null +++ b/victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/special_tokens_map.json @@ -0,0 +1,51 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "cls_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "mask_token": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "sep_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git 
a/victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/tokenizer.json b/victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..2a51933f1ccb3cf68d53b877cbfa24734ada642f --- /dev/null +++ b/victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:883b037111086fd4dfebbbc9b7cee11e1517b5e0c0514879478661440f137085 +size 17082987 diff --git a/victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/tokenizer_config.json b/victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..cd94cdf46ab8c0bada654d8973c84daf3790852b --- /dev/null +++ b/victord/sub19/models/Webiks_Hebrew_RAGbot_KolZchut_QA_Embedder_v1.0/tokenizer_config.json @@ -0,0 +1,62 @@ +{ + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250001": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": true, + "cls_token": "", + "eos_token": "", + "extra_special_tokens": {}, + "mask_token": "", + "max_length": 512, + "model_max_length": 512, + "pad_to_multiple_of": null, + "pad_token": "", + "pad_token_type_id": 0, + "padding_side": "right", + "sep_token": "", + 
"stride": 0, + "tokenizer_class": "XLMRobertaTokenizerFast", + "truncation_side": "right", + "truncation_strategy": "longest_first", + "unk_token": "" +} diff --git a/victord/sub19/models/bge-m3/1_Pooling/config.json b/victord/sub19/models/bge-m3/1_Pooling/config.json new file mode 100644 index 0000000000000000000000000000000000000000..0dfd14b551978a38ff975782a03ffb4cadedc0c7 --- /dev/null +++ b/victord/sub19/models/bge-m3/1_Pooling/config.json @@ -0,0 +1,10 @@ +{ + "word_embedding_dimension": 1024, + "pooling_mode_cls_token": true, + "pooling_mode_mean_tokens": false, + "pooling_mode_max_tokens": false, + "pooling_mode_mean_sqrt_len_tokens": false, + "pooling_mode_weightedmean_tokens": false, + "pooling_mode_lasttoken": false, + "include_prompt": true +} \ No newline at end of file diff --git a/victord/sub19/models/bge-m3/README.md b/victord/sub19/models/bge-m3/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e5a320176edd6ee1cfb256e68ee5ac0004de9447 --- /dev/null +++ b/victord/sub19/models/bge-m3/README.md @@ -0,0 +1,300 @@ +--- +pipeline_tag: sentence-similarity +tags: +- sentence-transformers +- feature-extraction +- sentence-similarity +license: mit +--- + +For more details please refer to our github repo: https://github.com/FlagOpen/FlagEmbedding + +# BGE-M3 ([paper](https://arxiv.org/pdf/2402.03216.pdf), [code](https://github.com/FlagOpen/FlagEmbedding/tree/master/FlagEmbedding/BGE_M3)) + +In this project, we introduce BGE-M3, which is distinguished for its versatility in Multi-Functionality, Multi-Linguality, and Multi-Granularity. +- Multi-Functionality: It can simultaneously perform the three common retrieval functionalities of embedding model: dense retrieval, multi-vector retrieval, and sparse retrieval. +- Multi-Linguality: It can support more than 100 working languages. +- Multi-Granularity: It is able to process inputs of different granularities, spanning from short sentences to long documents of up to 8192 tokens. 
+ + + +**Some suggestions for retrieval pipeline in RAG** + +We recommend to use the following pipeline: hybrid retrieval + re-ranking. +- Hybrid retrieval leverages the strengths of various methods, offering higher accuracy and stronger generalization capabilities. +A classic example: using both embedding retrieval and the BM25 algorithm. +Now, you can try to use BGE-M3, which supports both embedding and sparse retrieval. +This allows you to obtain token weights (similar to the BM25) without any additional cost when generate dense embeddings. +To use hybrid retrieval, you can refer to [Vespa](https://github.com/vespa-engine/pyvespa/blob/master/docs/sphinx/source/examples/mother-of-all-embedding-models-cloud.ipynb +) and [Milvus](https://github.com/milvus-io/pymilvus/blob/master/examples/hello_hybrid_sparse_dense.py). + +- As cross-encoder models, re-ranker demonstrates higher accuracy than bi-encoder embedding model. +Utilizing the re-ranking model (e.g., [bge-reranker](https://github.com/FlagOpen/FlagEmbedding/tree/master/FlagEmbedding/reranker), [bge-reranker-v2](https://github.com/FlagOpen/FlagEmbedding/tree/master/FlagEmbedding/llm_reranker)) after retrieval can further filter the selected text. + + +## News: +- 2024/7/1: **We update the MIRACL evaluation results of BGE-M3**. To reproduce the new results, you can refer to: [bge-m3_miracl_2cr](https://huggingface.co/datasets/hanhainebula/bge-m3_miracl_2cr). We have also updated our [paper](https://arxiv.org/pdf/2402.03216) on arXiv. +
+ Details + + The previous test results were lower because we mistakenly removed the passages that have the same id as the query from the search results. After correcting this mistake, the overall performance of BGE-M3 on MIRACL is higher than the previous results, but the experimental conclusion remains unchanged. The other results are not affected by this mistake. To reproduce the previous lower results, you need to add the `--remove-query` parameter when using `pyserini.search.faiss` or `pyserini.search.lucene` to search the passages. + +
+- 2024/3/20: **Thanks Milvus team!** Now you can use hybrid retrieval of bge-m3 in Milvus: [pymilvus/examples +/hello_hybrid_sparse_dense.py](https://github.com/milvus-io/pymilvus/blob/master/examples/hello_hybrid_sparse_dense.py). +- 2024/3/8: **Thanks for the [experimental results](https://towardsdatascience.com/openai-vs-open-source-multilingual-embedding-models-e5ccb7c90f05) from @[Yannael](https://huggingface.co/Yannael). In this benchmark, BGE-M3 achieves top performance in both English and other languages, surpassing models such as OpenAI.** +- 2024/3/2: Release unified fine-tuning [example](https://github.com/FlagOpen/FlagEmbedding/tree/master/examples/unified_finetune) and [data](https://huggingface.co/datasets/Shitao/bge-m3-data) +- 2024/2/6: We release the [MLDR](https://huggingface.co/datasets/Shitao/MLDR) (a long document retrieval dataset covering 13 languages) and [evaluation pipeline](https://github.com/FlagOpen/FlagEmbedding/tree/master/C_MTEB/MLDR). +- 2024/2/1: **Thanks for the excellent tool from Vespa.** You can easily use multiple modes of BGE-M3 following this [notebook](https://github.com/vespa-engine/pyvespa/blob/master/docs/sphinx/source/examples/mother-of-all-embedding-models-cloud.ipynb) + + +## Specs + +- Model + +| Model Name | Dimension | Sequence Length | Introduction | +|:----:|:---:|:---:|:---:| +| [BAAI/bge-m3](https://huggingface.co/BAAI/bge-m3) | 1024 | 8192 | multilingual; unified fine-tuning (dense, sparse, and colbert) from bge-m3-unsupervised| +| [BAAI/bge-m3-unsupervised](https://huggingface.co/BAAI/bge-m3-unsupervised) | 1024 | 8192 | multilingual; contrastive learning from bge-m3-retromae | +| [BAAI/bge-m3-retromae](https://huggingface.co/BAAI/bge-m3-retromae) | -- | 8192 | multilingual; extend the max_length of [xlm-roberta](https://huggingface.co/FacebookAI/xlm-roberta-large) to 8192 and further pretrained via [retromae](https://github.com/staoxiao/RetroMAE)| +| 
[BAAI/bge-large-en-v1.5](https://huggingface.co/BAAI/bge-large-en-v1.5) | 1024 | 512 | English model |
+| [BAAI/bge-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) | 768 | 512 | English model |
+| [BAAI/bge-small-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5) | 384 | 512 | English model |
+
+- Data
+
+| Dataset | Introduction |
+|:----------------------------------------------------------:|:-------------------------------------------------:|
+| [MLDR](https://huggingface.co/datasets/Shitao/MLDR) | Document Retrieval Dataset, covering 13 languages |
+| [bge-m3-data](https://huggingface.co/datasets/Shitao/bge-m3-data) | Fine-tuning data used by bge-m3 |
+
+
+
+## FAQ
+
+**1. Introduction for different retrieval methods**
+
+- Dense retrieval: map the text into a single embedding, e.g., [DPR](https://arxiv.org/abs/2004.04906), [BGE-v1.5](https://github.com/FlagOpen/FlagEmbedding)
+- Sparse retrieval (lexical matching): a vector of size equal to the vocabulary, with the majority of positions set to zero, calculating a weight only for tokens present in the text. e.g., BM25, [unicoil](https://arxiv.org/pdf/2106.14807.pdf), and [splade](https://arxiv.org/abs/2107.05720)
+- Multi-vector retrieval: use multiple vectors to represent a text, e.g., [ColBERT](https://arxiv.org/abs/2004.12832).
+
+
+**2. How to use BGE-M3 in other projects?**
+
+For embedding retrieval, you can employ the BGE-M3 model using the same approach as BGE.
+The only difference is that the BGE-M3 model no longer requires adding instructions to the queries.
+
+For hybrid retrieval, you can use [Vespa](https://github.com/vespa-engine/pyvespa/blob/master/docs/sphinx/source/examples/mother-of-all-embedding-models-cloud.ipynb
+) and [Milvus](https://github.com/milvus-io/pymilvus/blob/master/examples/hello_hybrid_sparse_dense.py).
+
+
+**3. 
How to fine-tune bge-M3 model?** +

+You can follow the instructions in this [example](https://github.com/FlagOpen/FlagEmbedding/tree/master/examples/finetune)
+to fine-tune the dense embedding.
+
+If you want to fine-tune all embedding functions of m3 (dense, sparse and colbert), you can refer to the [unified_fine-tuning example](https://github.com/FlagOpen/FlagEmbedding/tree/master/examples/unified_finetune)
+
+
+
+
+
+## Usage
+
+Install:
+```
+git clone https://github.com/FlagOpen/FlagEmbedding.git
+cd FlagEmbedding
+pip install -e .
+```
+or:
+```
+pip install -U FlagEmbedding
+```
+
+
+
+### Generate Embedding for text
+
+- Dense Embedding
+```python
+from FlagEmbedding import BGEM3FlagModel
+
+model = BGEM3FlagModel('BAAI/bge-m3',
+ use_fp16=True) # Setting use_fp16 to True speeds up computation with a slight performance degradation
+
+sentences_1 = ["What is BGE M3?", "Defination of BM25"]
+sentences_2 = ["BGE M3 is an embedding model supporting dense retrieval, lexical matching and multi-vector interaction.",
+ "BM25 is a bag-of-words retrieval function that ranks a set of documents based on the query terms appearing in each document"]
+
+embeddings_1 = model.encode(sentences_1,
+ batch_size=12,
+ max_length=8192, # If you don't need such a long length, you can set a smaller value to speed up the encoding process.
+ )['dense_vecs']
+embeddings_2 = model.encode(sentences_2)['dense_vecs']
+similarity = embeddings_1 @ embeddings_2.T
+print(similarity)
+# [[0.6265, 0.3477], [0.3499, 0.678 ]]
+```
+You can also use sentence-transformers and huggingface transformers to generate dense embeddings.
+Refer to [baai_general_embedding](https://github.com/FlagOpen/FlagEmbedding/tree/master/FlagEmbedding/baai_general_embedding#usage) for details. 
+
+
+- Sparse Embedding (Lexical Weight)
+```python
+from FlagEmbedding import BGEM3FlagModel
+
+model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True) # Setting use_fp16 to True speeds up computation with a slight performance degradation
+
+sentences_1 = ["What is BGE M3?", "Defination of BM25"]
+sentences_2 = ["BGE M3 is an embedding model supporting dense retrieval, lexical matching and multi-vector interaction.",
+ "BM25 is a bag-of-words retrieval function that ranks a set of documents based on the query terms appearing in each document"]
+
+output_1 = model.encode(sentences_1, return_dense=True, return_sparse=True, return_colbert_vecs=False)
+output_2 = model.encode(sentences_2, return_dense=True, return_sparse=True, return_colbert_vecs=False)
+
+# you can see the weight for each token:
+print(model.convert_id_to_token(output_1['lexical_weights']))
+# [{'What': 0.08356, 'is': 0.0814, 'B': 0.1296, 'GE': 0.252, 'M': 0.1702, '3': 0.2695, '?': 0.04092},
+# {'De': 0.05005, 'fin': 0.1368, 'ation': 0.04498, 'of': 0.0633, 'BM': 0.2515, '25': 0.3335}]
+
+
+# compute the scores via lexical matching
+lexical_scores = model.compute_lexical_matching_score(output_1['lexical_weights'][0], output_2['lexical_weights'][0])
+print(lexical_scores)
+# 0.19554901123046875
+
+print(model.compute_lexical_matching_score(output_1['lexical_weights'][0], output_1['lexical_weights'][1]))
+# 0.0
+```
+
+- Multi-Vector (ColBERT)
+```python
+from FlagEmbedding import BGEM3FlagModel
+
+model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)
+
+sentences_1 = ["What is BGE M3?", "Defination of BM25"]
+sentences_2 = ["BGE M3 is an embedding model supporting dense retrieval, lexical matching and multi-vector interaction.",
+ "BM25 is a bag-of-words retrieval function that ranks a set of documents based on the query terms appearing in each document"]
+
+output_1 = model.encode(sentences_1, return_dense=True, return_sparse=True, return_colbert_vecs=True)
+output_2 = model.encode(sentences_2, 
return_dense=True, return_sparse=True, return_colbert_vecs=True) + +print(model.colbert_score(output_1['colbert_vecs'][0], output_2['colbert_vecs'][0])) +print(model.colbert_score(output_1['colbert_vecs'][0], output_2['colbert_vecs'][1])) +# 0.7797 +# 0.4620 +``` + + +### Compute score for text pairs +Input a list of text pairs, you can get the scores computed by different methods. +```python +from FlagEmbedding import BGEM3FlagModel + +model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True) + +sentences_1 = ["What is BGE M3?", "Defination of BM25"] +sentences_2 = ["BGE M3 is an embedding model supporting dense retrieval, lexical matching and multi-vector interaction.", + "BM25 is a bag-of-words retrieval function that ranks a set of documents based on the query terms appearing in each document"] + +sentence_pairs = [[i,j] for i in sentences_1 for j in sentences_2] + +print(model.compute_score(sentence_pairs, + max_passage_length=128, # a smaller max length leads to a lower latency + weights_for_different_modes=[0.4, 0.2, 0.4])) # weights_for_different_modes(w) is used to do weighted sum: w[0]*dense_score + w[1]*sparse_score + w[2]*colbert_score + +# { +# 'colbert': [0.7796499729156494, 0.4621465802192688, 0.4523794651031494, 0.7898575067520142], +# 'sparse': [0.195556640625, 0.00879669189453125, 0.0, 0.1802978515625], +# 'dense': [0.6259765625, 0.347412109375, 0.349853515625, 0.67822265625], +# 'sparse+dense': [0.482503205537796, 0.23454029858112335, 0.2332356721162796, 0.5122477412223816], +# 'colbert+sparse+dense': [0.6013619303703308, 0.3255828022956848, 0.32089319825172424, 0.6232916116714478] +# } +``` + + + + +## Evaluation + +We provide the evaluation script for [MKQA](https://github.com/FlagOpen/FlagEmbedding/tree/master/C_MTEB/MKQA) and [MLDR](https://github.com/FlagOpen/FlagEmbedding/tree/master/C_MTEB/MLDR) + +### Benchmarks from the open-source community + ![avatar](./imgs/others.webp) + The BGE-M3 model emerged as the top performer on this benchmark (OAI 
is short for OpenAI). 
+ For more details, please refer to the [article](https://towardsdatascience.com/openai-vs-open-source-multilingual-embedding-models-e5ccb7c90f05) and [Github Repo](https://github.com/Yannael/multilingual-embeddings) 
+
+
+### Our results
+- Multilingual (Miracl dataset)
+
+![avatar](./imgs/miracl.jpg)
+
+- Cross-lingual (MKQA dataset)
+
+![avatar](./imgs/mkqa.jpg)
+
+- Long Document Retrieval
+  - MLDR: 
+  ![avatar](./imgs/long.jpg) 
+  Please note that [MLDR](https://huggingface.co/datasets/Shitao/MLDR) is a document retrieval dataset we constructed via LLM, 
+  covering 13 languages, including test set, validation set, and training set. 
+  We utilized the training set from MLDR to enhance the model's long document retrieval capabilities. 
+  Therefore, comparing baselines with `Dense w.o.long`(fine-tuning without long document dataset) is more equitable. 
+  Additionally, this long document retrieval dataset will be open-sourced to address the current lack of open-source multilingual long text retrieval datasets. 
+  We believe that this data will be helpful for the open-source community in training document retrieval models. 
+
+  - NarrativeQA: 
+  ![avatar](./imgs/nqa.jpg) 
+
+- Comparison with BM25 
+
+We utilized Pyserini to implement BM25, and the test results can be reproduced by this [script](https://github.com/FlagOpen/FlagEmbedding/tree/master/C_MTEB/MLDR#bm25-baseline). 
+We tested BM25 using two different tokenizers: 
+one using Lucene Analyzer and the other using the same tokenizer as M3 (i.e., the tokenizer of xlm-roberta). 
+The results indicate that BM25 remains a competitive baseline, 
+especially in long document retrieval. 
+
+![avatar](./imgs/bm25.jpg)
+
+
+
+## Training
+- Self-knowledge Distillation: combining multiple outputs from different 
+retrieval modes as a reward signal to enhance the performance of a single mode (especially for sparse retrieval and multi-vec (colbert) retrieval)
+- Efficient Batching: Improve the efficiency when fine-tuning on long text. 
+The small-batch strategy is simple but effective, and it can also be used to fine-tune large embedding models.
+- MCLS: A simple method to improve the performance on long text without fine-tuning. 
+If you do not have enough resources to fine-tune the model with long text, this method is useful.
+
+Refer to our [report](https://arxiv.org/pdf/2402.03216.pdf) for more details.
+
+
+
+
+
+## Acknowledgement
+
+Thanks to the authors of open-sourced datasets, including Miracl, MKQA, NarrativeQA, etc. 
+Thanks to the open-sourced libraries like [Tevatron](https://github.com/texttron/tevatron), [Pyserini](https://github.com/castorini/pyserini).
+
+
+
+## Citation
+
+If you find this repository useful, please consider giving a star :star: and citation
+
+```
+@misc{bge-m3,
+  title={BGE M3-Embedding: Multi-Lingual, Multi-Functionality, Multi-Granularity Text Embeddings Through Self-Knowledge Distillation}, 
+  author={Jianlv Chen and Shitao Xiao and Peitian Zhang and Kun Luo and Defu Lian and Zheng Liu},
+  year={2024},
+  eprint={2402.03216},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL}
+}
+```
diff --git a/victord/sub19/models/bge-m3/config.json b/victord/sub19/models/bge-m3/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..7383635394a240c036e6ec99340905ee2754b927
--- /dev/null
+++ b/victord/sub19/models/bge-m3/config.json
@@ -0,0 +1,27 @@
+{
+ "architectures": [
+ "XLMRobertaModel"
+ ],
+ "attention_probs_dropout_prob": 0.1,
+ "bos_token_id": 0,
+ "classifier_dropout": null,
+ "eos_token_id": 2,
+ "hidden_act": "gelu",
+ "hidden_dropout_prob": 0.1,
+ "hidden_size": 1024,
+ "initializer_range": 0.02,
+ 
"intermediate_size": 4096, + "layer_norm_eps": 1e-05, + "max_position_embeddings": 8194, + "model_type": "xlm-roberta", + "num_attention_heads": 16, + "num_hidden_layers": 24, + "output_past": true, + "pad_token_id": 1, + "position_embedding_type": "absolute", + "torch_dtype": "bfloat16", + "transformers_version": "4.53.2", + "type_vocab_size": 1, + "use_cache": true, + "vocab_size": 250002 +} diff --git a/victord/sub19/models/bge-m3/config_sentence_transformers.json b/victord/sub19/models/bge-m3/config_sentence_transformers.json new file mode 100644 index 0000000000000000000000000000000000000000..aaa499896f85c8e2ba321476fd1479e476145577 --- /dev/null +++ b/victord/sub19/models/bge-m3/config_sentence_transformers.json @@ -0,0 +1,14 @@ +{ + "__version__": { + "sentence_transformers": "5.1.1", + "transformers": "4.53.2", + "pytorch": "2.9.0+cu128" + }, + "model_type": "SentenceTransformer", + "prompts": { + "query": "", + "document": "" + }, + "default_prompt_name": null, + "similarity_fn_name": "cosine" +} \ No newline at end of file diff --git a/victord/sub19/models/bge-m3/model.safetensors b/victord/sub19/models/bge-m3/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9b9d10ba9b13f7d50a8952cfc94dfde42c2fc1c6 --- /dev/null +++ b/victord/sub19/models/bge-m3/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebd7cf4fcc1a794f73ea3213266a7bd44e995eac0751b6e98ec9bb91cfc57202 +size 1135554736 diff --git a/victord/sub19/models/bge-m3/modules.json b/victord/sub19/models/bge-m3/modules.json new file mode 100644 index 0000000000000000000000000000000000000000..952a9b81c0bfd99800fabf352f69c7ccd46c5e43 --- /dev/null +++ b/victord/sub19/models/bge-m3/modules.json @@ -0,0 +1,20 @@ +[ + { + "idx": 0, + "name": "0", + "path": "", + "type": "sentence_transformers.models.Transformer" + }, + { + "idx": 1, + "name": "1", + "path": "1_Pooling", + "type": "sentence_transformers.models.Pooling" + }, + { + "idx": 
2, + "name": "2", + "path": "2_Normalize", + "type": "sentence_transformers.models.Normalize" + } +] \ No newline at end of file diff --git a/victord/sub19/models/bge-m3/sentence_bert_config.json b/victord/sub19/models/bge-m3/sentence_bert_config.json new file mode 100644 index 0000000000000000000000000000000000000000..306f5e30b3047fbad6af657cae7db9b911d72216 --- /dev/null +++ b/victord/sub19/models/bge-m3/sentence_bert_config.json @@ -0,0 +1,4 @@ +{ + "max_seq_length": 8192, + "do_lower_case": false +} \ No newline at end of file diff --git a/victord/sub19/models/bge-m3/sentencepiece.bpe.model b/victord/sub19/models/bge-m3/sentencepiece.bpe.model new file mode 100644 index 0000000000000000000000000000000000000000..7a3f40a75f870bc1f21700cd414dc2acc431583c --- /dev/null +++ b/victord/sub19/models/bge-m3/sentencepiece.bpe.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865 +size 5069051 diff --git a/victord/sub19/models/bge-m3/special_tokens_map.json b/victord/sub19/models/bge-m3/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b1879d702821e753ffe4245048eee415d54a9385 --- /dev/null +++ b/victord/sub19/models/bge-m3/special_tokens_map.json @@ -0,0 +1,51 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "cls_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "mask_token": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "sep_token": { + "content": "", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/victord/sub19/models/bge-m3/tokenizer.json b/victord/sub19/models/bge-m3/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..a8c21d8691f041c9859f90256299e3f31e556d5b --- /dev/null +++ b/victord/sub19/models/bge-m3/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:249df0778f236f6ece390de0de746838ef25b9d6954b68c2ee71249e0a9d8fd4 +size 17082799 diff --git a/victord/sub19/models/bge-m3/tokenizer_config.json b/victord/sub19/models/bge-m3/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..95bd7c849ee6a47d5c92805af18d187239c1ba4a --- /dev/null +++ b/victord/sub19/models/bge-m3/tokenizer_config.json @@ -0,0 +1,56 @@ +{ + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250001": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": true, + "cls_token": "", + "eos_token": "", + "extra_special_tokens": {}, + "mask_token": "", + "model_max_length": 8192, + "pad_token": "", + "sep_token": "", + "sp_model_kwargs": {}, + "tokenizer_class": "XLMRobertaTokenizer", + "unk_token": "" +} diff --git 
a/victord/sub19/models/bge-reranker-v2-m3_pseudo_tune_full/README.md b/victord/sub19/models/bge-reranker-v2-m3_pseudo_tune_full/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3b56065171aee9b58e5f488d2366e70b95872517 --- /dev/null +++ b/victord/sub19/models/bge-reranker-v2-m3_pseudo_tune_full/README.md @@ -0,0 +1,503 @@ +--- +tags: +- sentence-transformers +- cross-encoder +- reranker +- generated_from_trainer +- dataset_size:175622 +- loss:BinaryCrossEntropyLoss +base_model: BAAI/bge-reranker-v2-m3 +pipeline_tag: text-ranking +library_name: sentence-transformers +metrics: +- pearson +- spearman +model-index: +- name: CrossEncoder based on BAAI/bge-reranker-v2-m3 + results: + - task: + type: cross-encoder-correlation + name: Cross Encoder Correlation + dataset: + name: sts dev + type: sts_dev + metrics: + - type: pearson + value: 0.5543320589612747 + name: Pearson + - type: spearman + value: 0.5369237805280626 + name: Spearman +--- + +# CrossEncoder based on BAAI/bge-reranker-v2-m3 + +This is a [Cross Encoder](https://www.sbert.net/docs/cross_encoder/usage/usage.html) model finetuned from [BAAI/bge-reranker-v2-m3](https://huggingface.co/BAAI/bge-reranker-v2-m3) on the train_dataset, HebNLI, HebQA, RAGbot and ParaShoot datasets using the [sentence-transformers](https://www.SBERT.net) library. It computes scores for pairs of texts, which can be used for text reranking and semantic search. 
+ +## Model Details + +### Model Description +- **Model Type:** Cross Encoder +- **Base model:** [BAAI/bge-reranker-v2-m3](https://huggingface.co/BAAI/bge-reranker-v2-m3) +- **Maximum Sequence Length:** 2048 tokens +- **Number of Output Labels:** 1 label +- **Training Datasets:** + - train_dataset + - HebNLI + - HebQA + - RAGbot + - ParaShoot + + + +### Model Sources + +- **Documentation:** [Sentence Transformers Documentation](https://sbert.net) +- **Documentation:** [Cross Encoder Documentation](https://www.sbert.net/docs/cross_encoder/usage/usage.html) +- **Repository:** [Sentence Transformers on GitHub](https://github.com/huggingface/sentence-transformers) +- **Hugging Face:** [Cross Encoders on Hugging Face](https://huggingface.co/models?library=sentence-transformers&other=cross-encoder) + +## Usage + +### Direct Usage (Sentence Transformers) + +First install the Sentence Transformers library: + +```bash +pip install -U sentence-transformers +``` + +Then you can load this model and run inference. 
+```python +from sentence_transformers import CrossEncoder + +# Download from the 🤗 Hub +model = CrossEncoder("cross_encoder_model_id") +# Get scores for pairs of texts +pairs = [ + ['מה התשלום עבור דרגה X במעון או משפחתון?\n מה הזכאות עבור ילד נוסף במסגרת מעון או משפחתון?', 'דרגות השתתפות במימון מעונות יום ומשפחתונים\n \nגובה שכר הלימוד\nסכומי השתתפות המדינה והסכומים אותם משלימים ההורים עבור כל אחת מהדרגות, מפורטים בטבלאות שכר לימוד.\nהטבלאות מחולקות לפי תעריפים לילדים ותעריפים לתינוקות:\n\nגובה שכר הלימוד בהתאם לדרגות השונות מפורטת בטבלאות שכר לימוד במסגרות ילדים באתר משרד העבודה.\nהדרגות המופיעות בטבלאות מותאמות לגובה ההכנסה לנפש במשפחה.\nככל שדרגת הזכאות (בטווח 12-3) נמוכה יותר, כך גדלה השתתפות המדינה בתשלום.\nעבור רמות ההשתתפות המותאמות לגובה ההכנסות, יש להוסיף או להפחית דרגות, על-פי מספר מספר הילדים השוהים במעון או במשפחתון - למשפחה עם כמה ילדים השוהים במעון או במשפחתון, תוגדל רמת השתתפות המדינה עבור כל אחד מהילדים (תיקבע דרגה נמוכה יותר בין הדרגות 12-3).\nתינוקות הם מי שגילם היה עד 15 חודשים ב-1 בספטמבר של שנת הלימודים אליה נרשמו.\nילדים הם מי שב-1 בספטמבר של שנת הלימודים אליה נרשמו, גילם היה בין 15 חודשים ויום ל-33 חודשים (או עד לגיל 46 חודשים, למי שיש להם אישור מהיחידה להתפתחות הילד כי הם מעוכבי התפתחות).\n\nהשפעת מספר הילדים השוהים במעון/משפחתון על דרגת ההשתתפות\nדרגה בהתאם לרמת הכנסה\n: 3\nמספר הילדים\nהזכאים במשפחה \n\n: 2\nהדרגה הסופית\nהמותאמת \n\n: 14\n\nהשפעת מספר הילדים השוהים במעון/משפחתון על דרגת ההשתתפות\nדרגה בהתאם לרמת הכנסה\n: 3\nמספר הילדים\nהזכאים במשפחה \n\n: 3 ומעלה\nהדרגה הסופית\nהמותאמת \n\n: 15\nהשפעת מספר הילדים השוהים במעון/משפחתון על דרגת ההשתתפות\n\n\nדרגה בהתאם לרמת הכנסה\n\nמספר הילדים\nהזכאים במשפחה \n\n\nהדרגה הסופית\nהמותאמת \n\n\n\n3\n2\n14\n\n\n3\n3 ומעלה\n15'], + ['מה התשלום עבור דרגה X במעון או משפחתון?\n מה הזכאות עבור ילד נוסף במסגרת מעון או משפחתון?', 'סיוע להורים עצמאיים (הורים יחידים) במימון מעונות יום ומשפחתונים\n \nהורים עצמאיים (הורים יחידים) (/he/הורה_עצמאי_(הורה_יחיד)) עשויים להיות זכאים לסיוע במימון מעונות יום 
ומשפחתונים לפי דרגות השתתפות (/he/דרגות_השתתפות_במימון_מעונות_יום_ומשפחתונים) שנקבעות בהתאם לרמת ההכנסה שלהם (/he/רמת_ההכנסה_לנפש_לצורך_סיוע_במימון_מעונות_יום)\n\n\n \nהורים עצמאיים שעונים על הגדרת מגבירי עבודה (.D7.96.D7.9B.D7.90.D7.95.D7.AA_.D7.A9.D7.9C_.D7.94.D7.95.D7.A8.D7.99.D7.9D_.D7.9E.D7.92.D7.91.D7.99.D7.A8.D7.99_.D7.A2.D7.91.D7.95.D7.93.D7.94) זכאים לסיוע מוגדל וישלמו 250 בחודש לילד אחד ו-375 בחודש לשני ילדים\nכדי להקל על שילובם של הורים בשוק העבודה, המדינה מסייעת להם במימון מעונות יום ומשפחתונים מוכרים.\n\nגובה התמיכה בזכאות הכללית של ההורים (זוגות הורים או הורים עצמאיים) נקבע על-פי מספר דרגות השתתפות המתבססות על רמת ההכנסה לנפש במשפחה.\nהורים עצמאיים (הורים יחידים) שעונים על הגדרת "מגבירי עבודה" (בהתאם למפורט בהמשך), זכאים לסיוע מוגדל וישלמו רק 250 ₪ בחודש עבור ילד אחד ו-375 ₪ בחודש עבור שני ילדים.\nלפרטים ומידע כללי על הזכות, ראו השתתפות במימון מעונות יום ומשפחתונים.'], + ['מה התשלום עבור דרגה X במעון או משפחתון?\n מה הזכאות עבור ילד נוסף במסגרת מעון או משפחתון?', 'השתתפות במימון מעונות יום ומשפחתונים\n תהליך מימוש הזכות\nאיתור מסגרת מוכרת\nמערכת אינטרנטית (מקוונת) מאפשרת לאתר משפחתונים, מעונות וצהרונים הנמצאים בפיקוח ממשלתי.\nניתן לבצע חיפוש לפי ישוב, סוג המסגרת או שם המסגרת. לכניסה למערכת לחצו כאן.\n\nהתשלום למעון/משפחתון\nעבור חודש אוגוסט יחושב תשלום באופן יחסי בהתאם לתקופת שהות הילד במעון (העלות המלאה לחודש כפול מספר החודשים שהילד שהה במעון חלקי 12 חודשים).\nהחזר כספי בהתאם לדרגת ההשתתפות, יינתן להורים עבור החודש שבו התקבל שאלון ההרשמה במוקד, ועבור החודש שקדם לו.\nהשתתפות בחודש הראשון לכניסת הילד למסגרת (מעון או משפחתון):\n\nהוריו של ילד שנכנס למסגרת עד ל-15 בחודש (כולל ה-15 לחודש), יהיו זכאים להשתתפות המדינה עבור אותו חודש (על-פי הדרגה שנקבעה להם).\nהוריו של ילד שנכנס למסגרת לאחר ה-15 לחודש, לא יהיו זכאים להשתתפות בחודש זה. 
במקרה זה יחויבו ההורים לשלם דמי החזקה באופן יחסי לימים בהם שהה הילד במסגרת.\n\nהיעדרות של הילד\nהיעדרות מהמעון לתקופה של יותר מ-21 ימים תיחשב כעזיבה, אלא אם כן הוצגו במהלך תקופה זו אישורים רפואיים על מחלה או אשפוז של הילד.\nהיעדרות של 45 ימים ומעלה בשל אשפוז או מחלה, תיחשב כעזיבה החל מהיום ה-45. חזרה של ילד למעון לאחר מכן, תחייב את ההורה להגיש בקשה חדשה לקביעת דרגת ההשתתפות בצירוף מסמכים עדכניים.\n\nשינוי בתנאי הזכאות\nהזכאות נמשכת כל עוד מתקיימים התנאים המזכים.\nבמקרה שחל שינוי בתנאים - ההורים חייבים למסור על כך הודעה בכתב לאגף מעונות יום ומשפחתונים, לא יאוחר מ-30 יום ממועד השינוי.\n\nבמהלך שנת לימודים, יתכן שיידרשו מההורים נתוני הכנסה מעודכנים כתנאי להמשך תשלום התמיכה.\nגובה התמיכה יעודכן, כלפי מעלה או מטה, על בסיס הנתונים החדשים.\nאת העדכון יש לבצע באמצעות מערכת עדכונט באתר משרד העבודה. ניתן גם להיעזר במוקד מעונות יום, משפחתונים וצהרונים, בטלפון *2969.'], + ['מה התשלום עבור דרגה X במעון או משפחתון?\n מה הזכאות עבור ילד נוסף במסגרת מעון או משפחתון?', 'מדריך למי שפונו או התפנו מבתיהם עקב מלחמת חרבות ברזל\n Metadata\nמענק למעונות יום ומשפחתונים שלא גבו כסף מההורים או החזירו חלק מהתשלום להורים\nמעונות יום ומשפחתונים בעלי סמל, שבהתאם להנחיות פיקוד העורף, לא פעלו במהלך חודש אוקטובר או שפעלו באופן חלקי בלבד, זכאים למענק סיוע חד-פעמי בגין הימים שבהם לא פעלו.\nהמענק יינתן בתנאי שלא גבו מההורים תשלום עבור הימים שבהם לא היתה פעילות, ואם היתה פעילות חלקית - הם השיבו להורים את החלק היחסי מהתשלום.\nגובה המענק הוא 889 ₪ עבור כל ילד במעון.\nלא ניתן לקבל גם מענק ממשרד העבודה וגם פיצוי מרשות המסים. 
מעון או משפחתון שקיבל מענק ממשרד העבודה והגיש תביעה לפיצוי מרשות המסים, סכום המענק יופחת מגובה הפיצוי שיקבל מרשות המסים.\nלמידע נוסף ראו מענק למעונות ומשפחתונים מוכרים בתקופת מלחמת חרבות ברזל'], + ['מה התשלום עבור דרגה X במעון או משפחתון?\n מה הזכאות עבור ילד נוסף במסגרת מעון או משפחתון?', 'מדריך הורות משותפת\n מענקים וקצבאות\n\n\nמיקום הילד/ה במשפחה\nסכום הקצבה עבור הילד/ה (נכון ל-2024)\n\n\nילד ראשון\n169 ₪\n\n\nילד שני\n214 ₪\n\n\nילד שלישי\n214 ₪ (+ 111 ₪ להורים שמקבלים קצבאות קיום)\n\n\nילד רביעי\n214 ₪ (+ 111 ₪ להורים שמקבלים קצבאות קיום)\n\n\nילד חמישי ואילך\n169 ₪\n\nלשיעורי הקצבה בשנים הקודמות ראו באתר המוסד לביטוח לאומי.\nלמידע נוסף ראו קצבת ילדים.\n\nמעונות יום\nקדימות בקבלה\nהורים העונים על קריטריונים שונים, עשויים להיות זכאים לקדימות בקבלה למעונות היום שנמצאים באחריות משרד הרווחה.\nבין הקריטריונים נמצאים "הורים עצמאים", הכוללים הורה לא נשוי ושאין לו ידוע בציבור, למשל, אישה בהורות משותפת.\nמעונות יום\nהשתתפות במימון\nכדי לעודד את שילובם של הורים בשוק העבודה, המדינה מסייעת במימון מעונות יום ומשפחתונים מוכרים. 
גובה הסיוע ניתן בהתאם לרמת ההכנסה.\nלמידע נוסף ראו השתתפות במימון מעונות יום ומשפחתונים.'], +] +scores = model.predict(pairs) +print(scores.shape) +# (5,) + +# Or rank different texts based on similarity to a single text +ranks = model.rank( + 'מה התשלום עבור דרגה X במעון או משפחתון?\n מה הזכאות עבור ילד נוסף במסגרת מעון או משפחתון?', + [ + 'דרגות השתתפות במימון מעונות יום ומשפחתונים\n \nגובה שכר הלימוד\nסכומי השתתפות המדינה והסכומים אותם משלימים ההורים עבור כל אחת מהדרגות, מפורטים בטבלאות שכר לימוד.\nהטבלאות מחולקות לפי תעריפים לילדים ותעריפים לתינוקות:\n\nגובה שכר הלימוד בהתאם לדרגות השונות מפורטת בטבלאות שכר לימוד במסגרות ילדים באתר משרד העבודה.\nהדרגות המופיעות בטבלאות מותאמות לגובה ההכנסה לנפש במשפחה.\nככל שדרגת הזכאות (בטווח 12-3) נמוכה יותר, כך גדלה השתתפות המדינה בתשלום.\nעבור רמות ההשתתפות המותאמות לגובה ההכנסות, יש להוסיף או להפחית דרגות, על-פי מספר מספר הילדים השוהים במעון או במשפחתון - למשפחה עם כמה ילדים השוהים במעון או במשפחתון, תוגדל רמת השתתפות המדינה עבור כל אחד מהילדים (תיקבע דרגה נמוכה יותר בין הדרגות 12-3).\nתינוקות הם מי שגילם היה עד 15 חודשים ב-1 בספטמבר של שנת הלימודים אליה נרשמו.\nילדים הם מי שב-1 בספטמבר של שנת הלימודים אליה נרשמו, גילם היה בין 15 חודשים ויום ל-33 חודשים (או עד לגיל 46 חודשים, למי שיש להם אישור מהיחידה להתפתחות הילד כי הם מעוכבי התפתחות).\n\nהשפעת מספר הילדים השוהים במעון/משפחתון על דרגת ההשתתפות\nדרגה בהתאם לרמת הכנסה\n: 3\nמספר הילדים\nהזכאים במשפחה \n\n: 2\nהדרגה הסופית\nהמותאמת \n\n: 14\n\nהשפעת מספר הילדים השוהים במעון/משפחתון על דרגת ההשתתפות\nדרגה בהתאם לרמת הכנסה\n: 3\nמספר הילדים\nהזכאים במשפחה \n\n: 3 ומעלה\nהדרגה הסופית\nהמותאמת \n\n: 15\nהשפעת מספר הילדים השוהים במעון/משפחתון על דרגת ההשתתפות\n\n\nדרגה בהתאם לרמת הכנסה\n\nמספר הילדים\nהזכאים במשפחה \n\n\nהדרגה הסופית\nהמותאמת \n\n\n\n3\n2\n14\n\n\n3\n3 ומעלה\n15', + 'סיוע להורים עצמאיים (הורים יחידים) במימון מעונות יום ומשפחתונים\n \nהורים עצמאיים (הורים יחידים) (/he/הורה_עצמאי_(הורה_יחיד)) עשויים להיות זכאים לסיוע במימון מעונות יום ומשפחתונים לפי דרגות השתתפות 
(/he/דרגות_השתתפות_במימון_מעונות_יום_ומשפחתונים) שנקבעות בהתאם לרמת ההכנסה שלהם (/he/רמת_ההכנסה_לנפש_לצורך_סיוע_במימון_מעונות_יום)\n\n\n \nהורים עצמאיים שעונים על הגדרת מגבירי עבודה (.D7.96.D7.9B.D7.90.D7.95.D7.AA_.D7.A9.D7.9C_.D7.94.D7.95.D7.A8.D7.99.D7.9D_.D7.9E.D7.92.D7.91.D7.99.D7.A8.D7.99_.D7.A2.D7.91.D7.95.D7.93.D7.94) זכאים לסיוע מוגדל וישלמו 250 בחודש לילד אחד ו-375 בחודש לשני ילדים\nכדי להקל על שילובם של הורים בשוק העבודה, המדינה מסייעת להם במימון מעונות יום ומשפחתונים מוכרים.\n\nגובה התמיכה בזכאות הכללית של ההורים (זוגות הורים או הורים עצמאיים) נקבע על-פי מספר דרגות השתתפות המתבססות על רמת ההכנסה לנפש במשפחה.\nהורים עצמאיים (הורים יחידים) שעונים על הגדרת "מגבירי עבודה" (בהתאם למפורט בהמשך), זכאים לסיוע מוגדל וישלמו רק 250 ₪ בחודש עבור ילד אחד ו-375 ₪ בחודש עבור שני ילדים.\nלפרטים ומידע כללי על הזכות, ראו השתתפות במימון מעונות יום ומשפחתונים.', + 'השתתפות במימון מעונות יום ומשפחתונים\n תהליך מימוש הזכות\nאיתור מסגרת מוכרת\nמערכת אינטרנטית (מקוונת) מאפשרת לאתר משפחתונים, מעונות וצהרונים הנמצאים בפיקוח ממשלתי.\nניתן לבצע חיפוש לפי ישוב, סוג המסגרת או שם המסגרת. לכניסה למערכת לחצו כאן.\n\nהתשלום למעון/משפחתון\nעבור חודש אוגוסט יחושב תשלום באופן יחסי בהתאם לתקופת שהות הילד במעון (העלות המלאה לחודש כפול מספר החודשים שהילד שהה במעון חלקי 12 חודשים).\nהחזר כספי בהתאם לדרגת ההשתתפות, יינתן להורים עבור החודש שבו התקבל שאלון ההרשמה במוקד, ועבור החודש שקדם לו.\nהשתתפות בחודש הראשון לכניסת הילד למסגרת (מעון או משפחתון):\n\nהוריו של ילד שנכנס למסגרת עד ל-15 בחודש (כולל ה-15 לחודש), יהיו זכאים להשתתפות המדינה עבור אותו חודש (על-פי הדרגה שנקבעה להם).\nהוריו של ילד שנכנס למסגרת לאחר ה-15 לחודש, לא יהיו זכאים להשתתפות בחודש זה. במקרה זה יחויבו ההורים לשלם דמי החזקה באופן יחסי לימים בהם שהה הילד במסגרת.\n\nהיעדרות של הילד\nהיעדרות מהמעון לתקופה של יותר מ-21 ימים תיחשב כעזיבה, אלא אם כן הוצגו במהלך תקופה זו אישורים רפואיים על מחלה או אשפוז של הילד.\nהיעדרות של 45 ימים ומעלה בשל אשפוז או מחלה, תיחשב כעזיבה החל מהיום ה-45. 
חזרה של ילד למעון לאחר מכן, תחייב את ההורה להגיש בקשה חדשה לקביעת דרגת ההשתתפות בצירוף מסמכים עדכניים.\n\nשינוי בתנאי הזכאות\nהזכאות נמשכת כל עוד מתקיימים התנאים המזכים.\nבמקרה שחל שינוי בתנאים - ההורים חייבים למסור על כך הודעה בכתב לאגף מעונות יום ומשפחתונים, לא יאוחר מ-30 יום ממועד השינוי.\n\nבמהלך שנת לימודים, יתכן שיידרשו מההורים נתוני הכנסה מעודכנים כתנאי להמשך תשלום התמיכה.\nגובה התמיכה יעודכן, כלפי מעלה או מטה, על בסיס הנתונים החדשים.\nאת העדכון יש לבצע באמצעות מערכת עדכונט באתר משרד העבודה. ניתן גם להיעזר במוקד מעונות יום, משפחתונים וצהרונים, בטלפון *2969.', + 'מדריך למי שפונו או התפנו מבתיהם עקב מלחמת חרבות ברזל\n Metadata\nמענק למעונות יום ומשפחתונים שלא גבו כסף מההורים או החזירו חלק מהתשלום להורים\nמעונות יום ומשפחתונים בעלי סמל, שבהתאם להנחיות פיקוד העורף, לא פעלו במהלך חודש אוקטובר או שפעלו באופן חלקי בלבד, זכאים למענק סיוע חד-פעמי בגין הימים שבהם לא פעלו.\nהמענק יינתן בתנאי שלא גבו מההורים תשלום עבור הימים שבהם לא היתה פעילות, ואם היתה פעילות חלקית - הם השיבו להורים את החלק היחסי מהתשלום.\nגובה המענק הוא 889 ₪ עבור כל ילד במעון.\nלא ניתן לקבל גם מענק ממשרד העבודה וגם פיצוי מרשות המסים. 
מעון או משפחתון שקיבל מענק ממשרד העבודה והגיש תביעה לפיצוי מרשות המסים, סכום המענק יופחת מגובה הפיצוי שיקבל מרשות המסים.\nלמידע נוסף ראו מענק למעונות ומשפחתונים מוכרים בתקופת מלחמת חרבות ברזל', + 'מדריך הורות משותפת\n מענקים וקצבאות\n\n\nמיקום הילד/ה במשפחה\nסכום הקצבה עבור הילד/ה (נכון ל-2024)\n\n\nילד ראשון\n169 ₪\n\n\nילד שני\n214 ₪\n\n\nילד שלישי\n214 ₪ (+ 111 ₪ להורים שמקבלים קצבאות קיום)\n\n\nילד רביעי\n214 ₪ (+ 111 ₪ להורים שמקבלים קצבאות קיום)\n\n\nילד חמישי ואילך\n169 ₪\n\nלשיעורי הקצבה בשנים הקודמות ראו באתר המוסד לביטוח לאומי.\nלמידע נוסף ראו קצבת ילדים.\n\nמעונות יום\nקדימות בקבלה\nהורים העונים על קריטריונים שונים, עשויים להיות זכאים לקדימות בקבלה למעונות היום שנמצאים באחריות משרד הרווחה.\nבין הקריטריונים נמצאים "הורים עצמאים", הכוללים הורה לא נשוי ושאין לו ידוע בציבור, למשל, אישה בהורות משותפת.\nמעונות יום\nהשתתפות במימון\nכדי לעודד את שילובם של הורים בשוק העבודה, המדינה מסייעת במימון מעונות יום ומשפחתונים מוכרים. גובה הסיוע ניתן בהתאם לרמת ההכנסה.\nלמידע נוסף ראו השתתפות במימון מעונות יום ומשפחתונים.', + ] +) +# [{'corpus_id': ..., 'score': ...}, {'corpus_id': ..., 'score': ...}, ...] +``` + + + + + + + +## Evaluation + +### Metrics + +#### Cross Encoder Correlation + +* Dataset: `sts_dev` +* Evaluated with [CrossEncoderCorrelationEvaluator](https://sbert.net/docs/package_reference/cross_encoder/evaluation.html#sentence_transformers.cross_encoder.evaluation.CrossEncoderCorrelationEvaluator) + +| Metric | Value | +|:-------------|:-----------| +| pearson | 0.5543 | +| **spearman** | **0.5369** | + + + + + +## Training Details + +### Training Datasets +
train_dataset + +#### train_dataset + +* Dataset: train_dataset +* Size: 40,680 training samples +* Columns: sentence1, sentence2, and score +* Approximate statistics based on the first 1000 samples: + | | sentence1 | sentence2 | score | + |:--------|:------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------|:---------------------------------------------------------------| + | type | string | string | float | + | details |
  • min: 14 characters
  • mean: 46.84 characters
  • max: 134 characters
|
  • min: 86 characters
  • mean: 1021.29 characters
  • max: 9471 characters
|
  • min: 0.0
  • mean: 0.31
  • max: 1.0
| +* Samples: + | sentence1 | sentence2 | score | + |:---------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:-----------------| + | מה התשלום עבור דרגה X במעון או משפחתון?
מה הזכאות עבור ילד נוסף במסגרת מעון או משפחתון?
| דרגות השתתפות במימון מעונות יום ומשפחתונים

גובה שכר הלימוד
סכומי השתתפות המדינה והסכומים אותם משלימים ההורים עבור כל אחת מהדרגות, מפורטים בטבלאות שכר לימוד.
הטבלאות מחולקות לפי תעריפים לילדים ותעריפים לתינוקות:

גובה שכר הלימוד בהתאם לדרגות השונות מפורטת בטבלאות שכר לימוד במסגרות ילדים באתר משרד העבודה.
הדרגות המופיעות בטבלאות מותאמות לגובה ההכנסה לנפש במשפחה.
ככל שדרגת הזכאות (בטווח 12-3) נמוכה יותר, כך גדלה השתתפות המדינה בתשלום.
עבור רמות ההשתתפות המותאמות לגובה ההכנסות, יש להוסיף או להפחית דרגות, על-פי מספר הילדים השוהים במעון או במשפחתון - למשפחה עם כמה ילדים השוהים במעון או במשפחתון, תוגדל רמת השתתפות המדינה עבור כל אחד מהילדים (תיקבע דרגה נמוכה יותר בין הדרגות 12-3).<br>
תינוקות הם מי שגילם היה עד 15 חודשים ב-1 בספטמבר של שנת הלימודים אליה נרשמו.
ילדים הם מי שב-1 בספטמבר של שנת הלימודים אליה נרשמו, גילם היה בין 15 חודשים ויום ל-33 חודשים (או עד לגיל 46 חודשים, למי שיש להם אישור מהיחידה להתפתחות הילד כי הם מעוכבי התפתחות).

השפעת מספר הילדים השוהים במעון/משפחתון על דרגת ההשתתפ...
| 0.8 | + | מה התשלום עבור דרגה X במעון או משפחתון?
מה הזכאות עבור ילד נוסף במסגרת מעון או משפחתון?
| סיוע להורים עצמאיים (הורים יחידים) במימון מעונות יום ומשפחתונים

הורים עצמאיים (הורים יחידים) (/he/הורה_עצמאי_(הורה_יחיד)) עשויים להיות זכאים לסיוע במימון מעונות יום ומשפחתונים לפי דרגות השתתפות (/he/דרגות_השתתפות_במימון_מעונות_יום_ומשפחתונים) שנקבעות בהתאם לרמת ההכנסה שלהם (/he/רמת_ההכנסה_לנפש_לצורך_סיוע_במימון_מעונות_יום)



הורים עצמאיים שעונים על הגדרת מגבירי עבודה (.D7.96.D7.9B.D7.90.D7.95.D7.AA_.D7.A9.D7.9C_.D7.94.D7.95.D7.A8.D7.99.D7.9D_.D7.9E.D7.92.D7.91.D7.99.D7.A8.D7.99_.D7.A2.D7.91.D7.95.D7.93.D7.94) זכאים לסיוע מוגדל וישלמו 250 בחודש לילד אחד ו-375 בחודש לשני ילדים
כדי להקל על שילובם של הורים בשוק העבודה, המדינה מסייעת להם במימון מעונות יום ומשפחתונים מוכרים.

גובה התמיכה בזכאות הכללית של ההורים (זוגות הורים או הורים עצמאיים) נקבע על-פי מספר דרגות השתתפות המתבססות על רמת ההכנסה לנפש במשפחה.
הורים עצמאיים (הורים יחידים) שעונים על הגדרת "מגבירי עבודה" (בהתאם למפורט בהמשך), זכאים לסיוע מוגדל וישלמו רק 250 ₪ בחודש עבור ילד אחד ו-375 ₪ בחודש עבור שני ילדים.
לפרטים ומידע כללי ע...
| 0.7 | + | מה התשלום עבור דרגה X במעון או משפחתון?
מה הזכאות עבור ילד נוסף במסגרת מעון או משפחתון?
| השתתפות במימון מעונות יום ומשפחתונים
תהליך מימוש הזכות
איתור מסגרת מוכרת
מערכת אינטרנטית (מקוונת) מאפשרת לאתר משפחתונים, מעונות וצהרונים הנמצאים בפיקוח ממשלתי.
ניתן לבצע חיפוש לפי ישוב, סוג המסגרת או שם המסגרת. לכניסה למערכת לחצו כאן.

התשלום למעון/משפחתון
עבור חודש אוגוסט יחושב תשלום באופן יחסי בהתאם לתקופת שהות הילד במעון (העלות המלאה לחודש כפול מספר החודשים שהילד שהה במעון חלקי 12 חודשים).
החזר כספי בהתאם לדרגת ההשתתפות, יינתן להורים עבור החודש שבו התקבל שאלון ההרשמה במוקד, ועבור החודש שקדם לו.
השתתפות בחודש הראשון לכניסת הילד למסגרת (מעון או משפחתון):

הוריו של ילד שנכנס למסגרת עד ל-15 בחודש (כולל ה-15 לחודש), יהיו זכאים להשתתפות המדינה עבור אותו חודש (על-פי הדרגה שנקבעה להם).
הוריו של ילד שנכנס למסגרת לאחר ה-15 לחודש, לא יהיו זכאים להשתתפות בחודש זה. במקרה זה יחויבו ההורים לשלם דמי החזקה באופן יחסי לימים בהם שהה הילד במסגרת.

היעדרות של הילד
היעדרות מהמעון לתקופה של יותר מ-21 ימים תיחשב כעזיבה, אלא אם כן הוצגו במהלך תקופה זו אישורים רפואיים על מחלה או אשפוז של הילד.
היעדרות של 45...
| 0.8 | +* Loss: [BinaryCrossEntropyLoss](https://sbert.net/docs/package_reference/cross_encoder/losses.html#binarycrossentropyloss) with these parameters: + ```json + { + "activation_fn": "torch.nn.modules.linear.Identity", + "pos_weight": null + } + ``` +
+
HebNLI + +#### HebNLI + +* Dataset: HebNLI +* Size: 60,792 training samples +* Columns: sentence1, sentence2, and score +* Approximate statistics based on the first 1000 samples: + | | sentence1 | sentence2 | score | + |:--------|:-----------------------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------------------|:-----------------------------------------------------------------| + | type | string | string | float | + | details |
  • min: 4 characters
  • mean: 86.89 characters
  • max: 400 characters
|
  • min: 8 characters
  • mean: 43.86 characters
  • max: 153 characters
|
  • min: 0.01
  • mean: 0.41
  • max: 0.99
| +* Samples: + | sentence1 | sentence2 | score | + |:---------------------------------------------------------------------------------|:---------------------------------------------------------------------------------|:------------------| + | הצלחנו, אמרה טופנס. | טופנס אמרה שהם ניצחו. | 0.99 | + | אחרי שלושה ימים של אימונים אינטנסיביים הם סיימו את הקורס בהצטיינות. | הם השלימו את המסלול עם שעות לאחר שלושה ימים של אימונים אינטנסיביים. | 0.99 | + | הייתי מציע לך לנסוע לרוסיה מיד. | הייתי מציע לך להכין את הדרך לרוסיה באופן מיידי. | 0.99 | +* Loss: [BinaryCrossEntropyLoss](https://sbert.net/docs/package_reference/cross_encoder/losses.html#binarycrossentropyloss) with these parameters: + ```json + { + "activation_fn": "torch.nn.modules.linear.Identity", + "pos_weight": null + } + ``` +
+
HebQA + +#### HebQA + +* Dataset: HebQA +* Size: 60,294 training samples +* Columns: sentence1, sentence2, and score +* Approximate statistics based on the first 1000 samples: + | | sentence1 | sentence2 | score | + |:--------|:----------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------|:-----------------------------------------------------------------| + | type | string | string | float | + | details |
  • min: 13 characters
  • mean: 37.7 characters
  • max: 91 characters
|
  • min: 504 characters
  • mean: 695.62 characters
  • max: 1311 characters
|
  • min: 0.01
  • mean: 0.75
  • max: 0.99
| +* Samples: + | sentence1 | sentence2 | score | + |:------------------------------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------| + | כמה אנשים היו בוועדה שניסחה את הטיוטה הרביעית? | לאחר שהטיוטות הראשוניות הובאו למנהלת העם וספגו ביקורת חריפה, הוקמה ועדת חמישה בראשות משה שרת שניסחה טיוטה רביעית. ב-13 במאי לפנות ערב הוגשה הצעתה למנהלת העם. שרת עשה את רוב המלאכה, תוך התייעצות עם משפטנים מומחים. נוסח ההכרזה הושאל מכתב המנדט וממסמכים משפטיים רבים אחרים, וכל סעיף שלו התחיל במילים "הואיל ו...". בהצעה זו הוזכרה גם תוכנית החלוקה של האו"ם. מזכיר מנהלת העם כתב כי "בן-גוריון התנגד ל'הואיל' כי אינו עברי" וכן "התנגד למילים 'ישוב רב־איל', 'עוז וגבורה' וכיוצא באלה". כמו כן, התנגד להזכרה מפורשת של תוכנית החלוקה. | 0.8592790964302004 | + | כמה אנשים היו בוועדה שניסחה את הטיוטה הרביעית? | לאחר שהטיוטות הראשוניות הובאו למנהלת העם וספגו ביקורת חריפה, הוקמה ועדת חמישה בראשות משה שרת שניסחה טיוטה רביעית. ב-13 במאי לפנות ערב הוגשה הצעתה למנהלת העם. שרת עשה את רוב המלאכה, תוך התייעצות עם משפטנים מומחים. נוסח ההכרזה הושאל מכתב המנדט וממסמכים משפטיים רבים אחרים, וכל סעיף שלו התחיל במילים "הואיל ו...". בהצעה זו הוזכרה גם תוכנית החלוקה של האו"ם. מזכיר מנהלת העם כתב כי "בן-גוריון התנגד ל'הואיל' כי אינו עברי" וכן "התנגד למילים 'ישוב רב־איל', 'עוז וגבורה' וכיוצא באלה". כמו כן, התנגד להזכרה מפורשת של תוכנית החלוקה. | 0.99 | + | באילו מילים התחילו כל סעיפי הטויוטה? 
| לאחר שהטיוטות הראשוניות הובאו למנהלת העם וספגו ביקורת חריפה, הוקמה ועדת חמישה בראשות משה שרת שניסחה טיוטה רביעית. ב-13 במאי לפנות ערב הוגשה הצעתה למנהלת העם. שרת עשה את רוב המלאכה, תוך התייעצות עם משפטנים מומחים. נוסח ההכרזה הושאל מכתב המנדט וממסמכים משפטיים רבים אחרים, וכל סעיף שלו התחיל במילים "הואיל ו...". בהצעה זו הוזכרה גם תוכנית החלוקה של האו"ם. מזכיר מנהלת העם כתב כי "בן-גוריון התנגד ל'הואיל' כי אינו עברי" וכן "התנגד למילים 'ישוב רב־איל', 'עוז וגבורה' וכיוצא באלה". כמו כן, התנגד להזכרה מפורשת של תוכנית החלוקה. | 0.25883327354329505 | +* Loss: [BinaryCrossEntropyLoss](https://sbert.net/docs/package_reference/cross_encoder/losses.html#binarycrossentropyloss) with these parameters: + ```json + { + "activation_fn": "torch.nn.modules.linear.Identity", + "pos_weight": null + } + ``` +
+
RAGbot + +#### RAGbot + +* Dataset: RAGbot +* Size: 7,780 training samples +* Columns: sentence1, sentence2, and score +* Approximate statistics based on the first 1000 samples: + | | sentence1 | sentence2 | score | + |:--------|:------------------------------------------------------------------------------------------------|:----------------------------------------------------------------------------------------------------|:-----------------------------------------------------------------| + | type | string | string | float | + | details |
  • min: 10 characters
  • mean: 60.23 characters
  • max: 424 characters
|
  • min: 211 characters
  • mean: 1312.64 characters
  • max: 4772 characters
|
  • min: 0.01
  • mean: 0.71
  • max: 0.99
| +* Samples: + | sentence1 | sentence2 | score | + |:--------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------| + | לא ברור באתר
אם על חברות ביטוח פרטיות
חל חוק חופש המידע
אני מציע שזה יהיה כתוב מפורש
| חוק חופש המידע, התשנ"ח-1998 מאפשר לכל אזרח או תושב ישראל (/he/תושב_ישראל) לפנות בבקשה לרשות ציבורית לקבל מידע אותו היא מחזיקה. מטרתו הראשית של החוק היא להטמיע תפיסת יסוד של שיתוף במידע הציבורי.
פרטים

שם החוק:חוק חופש המידע, התשנ"ח-1998
קישור:החוק באתר נבו
שר אחראי:שר המשפטים
החוק ב"ספר החוקים הפתוח"
נושאים וזכויות
חופש המידע
הזכות לפרטיות
| 0.02321811595881069 | + | לא ברור באתר
אם על חברות ביטוח פרטיות
חל חוק חופש המידע
אני מציע שזה יהיה כתוב מפורש
| חוק חופש המידע, התשנ"ח-1998 מאפשר לכל אזרח או תושב ישראל (/he/תושב_ישראל) לפנות בבקשה לרשות ציבורית לקבל מידע אותו היא מחזיקה. מטרתו הראשית של החוק היא להטמיע תפיסת יסוד של שיתוף במידע הציבורי.
פרטים

שם החוק:חוק חופש המידע, התשנ"ח-1998
קישור:החוק באתר נבו
שר אחראי:שר המשפטים
החוק ב"ספר החוקים הפתוח"
נושאים וזכויות
חופש המידע
הזכות לפרטיות
| 0.99 | + | לא ברור באתר
אם על חברות ביטוח פרטיות
חל חוק חופש המידע
אני מציע שזה יהיה כתוב מפורש
| כל אזרח או תושב ישראל זכאי לקבלת מידע מרשות ציבורית (https://www.gov.il/he/departments/general/list_of_authorities)



לצורך קבלת המידע יש לפנות לממונה על יישום חוק חופש המידע ברשות הציבורית



לפני הגשת בקשה לקבלת מידע, מומלץ לברר אם המידע כבר קיים במאגר התשובות (https://www.gov.il/he/departments/general/answer_reservoir) של היחידה הממשלתית לחופש המידע במשרד המשפטים



הגשת הבקשה כרוכה בתשלום אגרה, אך במקרים מסוימים ניתן פטור מתשלום האגרה



למידע נוסף ראו הגשת בקשת חופש המידע (https://www.gov.il/he/service/freedom_of_information_submission) באתר היחידה הממשלתית לחופש המידע במשרד המשפטים
חוק חופש המידע קובע כי כל אזרח או תושב ישראל זכאי לקבל מידע מרשות ציבורית.

לרשימה המלאה של הרשויות הציבוריות שעליהן חל החוק ראו כאן.

אוכלוסיית יעד ותנאים מקדימים
כל אזרח או תושב ישראל.

למי ואיך פונים
יש לבדוק איזו רשות אחראית על הנושא שלגביו מבוקש המידע.
יש לפנות לממונה על יישום חוק חופש המידע ברשות האחראית:

לרשימת הממונים על יישום חוק חופש המידע ברשויות השונות באתר היחידה הממשלתית לחופש המידע...
| 0.01 | +* Loss: [BinaryCrossEntropyLoss](https://sbert.net/docs/package_reference/cross_encoder/losses.html#binarycrossentropyloss) with these parameters: + ```json + { + "activation_fn": "torch.nn.modules.linear.Identity", + "pos_weight": null + } + ``` +
+
ParaShoot + +#### ParaShoot + +* Dataset: ParaShoot +* Size: 6,076 training samples +* Columns: sentence1, sentence2, and score +* Approximate statistics based on the first 1000 samples: + | | sentence1 | sentence2 | score | + |:--------|:-----------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------|:-----------------------------------------------------------------| + | type | string | string | float | + | details |
  • min: 9 characters
  • mean: 37.13 characters
  • max: 119 characters
|
  • min: 500 characters
  • mean: 746.67 characters
  • max: 2042 characters
|
  • min: 0.01
  • mean: 0.91
  • max: 0.99
| +* Samples: + | sentence1 | sentence2 | score | + |:----------------------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:--------------------------------| + | במה בתחילת דרכה עסקה חברתו של אהרון רוזנפלד? | בתחילה עסקה חברתו מכל הבא ליד אך עם הגברת הבנייה בשנות ה-20 יצר רוזנפלד קשרים בבלגיה, ייבא משם חומרי בניין (בעיקר זכוכית) וייצג את חברת הספנות הבלגית שהביאה את החומרים הללו. כך עגנה ב-1923 בחיפה האונייה הבלגית הראשונה. רוזנפלד היה גם סוכנן של חברות ספנות ואוניות אמריקאיות. בשל פיתוח יחסי העסקים עם בלגיה התמנה ב-1929 לסגן-קונסול וב-1934 לקונסול בלגיה וליבריה בחיפה. ב-1932 קיבל מאלברט הראשון מלך בלגיה את אות הכבוד של אביר מסדר הכתר הבלגי . רוזנפלד השתתף בהקמתם של המרכז המסחרי הישן והחדש בעיר התחתית, והיה ממקימי "בנק בעלי הבתים בע"מ" ב-15 ביולי 1931. רוזנפלד היה חבר בארגונים חברתיים ועסקיים שונים, ובין השאר היה נשיא הבונים החופשיים, חבר ועדת הנמל בלשכת המסחר והתעשייה העברית ולשכת הספנות החיפאית (מ"מ נשיא לשכת הספנות באמצע שנות ה-60) ואזרח כבוד של חיפה. | 0.9341320672743826 | + | במה בתחילת דרכה עסקה חברתו של אהרון רוזנפלד? | בתחילה עסקה חברתו מכל הבא ליד אך עם הגברת הבנייה בשנות ה-20 יצר רוזנפלד קשרים בבלגיה, ייבא משם חומרי בניין (בעיקר זכוכית) וייצג את חברת הספנות הבלגית שהביאה את החומרים הללו. 
כך עגנה ב-1923 בחיפה האונייה הבלגית הראשונה. רוזנפלד היה גם סוכנן של חברות ספנות ואוניות אמריקאיות. בשל פיתוח יחסי העסקים עם בלגיה התמנה ב-1929 לסגן-קונסול וב-1934 לקונסול בלגיה וליבריה בחיפה. ב-1932 קיבל מאלברט הראשון מלך בלגיה את אות הכבוד של אביר מסדר הכתר הבלגי . רוזנפלד השתתף בהקמתם של המרכז המסחרי הישן והחדש בעיר התחתית, והיה ממקימי "בנק בעלי הבתים בע"מ" ב-15 ביולי 1931. רוזנפלד היה חבר בארגונים חברתיים ועסקיים שונים, ובין השאר היה נשיא הבונים החופשיים, חבר ועדת הנמל בלשכת המסחר והתעשייה העברית ולשכת הספנות החיפאית (מ"מ נשיא לשכת הספנות באמצע שנות ה-60) ואזרח כבוד של חיפה. | 0.99 | + | מה אירע בחיפה לראשונה בשנת 1923? | בתחילה עסקה חברתו מכל הבא ליד אך עם הגברת הבנייה בשנות ה-20 יצר רוזנפלד קשרים בבלגיה, ייבא משם חומרי בניין (בעיקר זכוכית) וייצג את חברת הספנות הבלגית שהביאה את החומרים הללו. כך עגנה ב-1923 בחיפה האונייה הבלגית הראשונה. רוזנפלד היה גם סוכנן של חברות ספנות ואוניות אמריקאיות. בשל פיתוח יחסי העסקים עם בלגיה התמנה ב-1929 לסגן-קונסול וב-1934 לקונסול בלגיה וליבריה בחיפה. ב-1932 קיבל מאלברט הראשון מלך בלגיה את אות הכבוד של אביר מסדר הכתר הבלגי . רוזנפלד השתתף בהקמתם של המרכז המסחרי הישן והחדש בעיר התחתית, והיה ממקימי "בנק בעלי הבתים בע"מ" ב-15 ביולי 1931. רוזנפלד היה חבר בארגונים חברתיים ועסקיים שונים, ובין השאר היה נשיא הבונים החופשיים, חבר ועדת הנמל בלשכת המסחר והתעשייה העברית ולשכת הספנות החיפאית (מ"מ נשיא לשכת הספנות באמצע שנות ה-60) ואזרח כבוד של חיפה. | 0.4503647439465404 | +* Loss: [BinaryCrossEntropyLoss](https://sbert.net/docs/package_reference/cross_encoder/losses.html#binarycrossentropyloss) with these parameters: + ```json + { + "activation_fn": "torch.nn.modules.linear.Identity", + "pos_weight": null + } + ``` +
+ +### Evaluation Dataset + +#### Unnamed Dataset + +* Size: 40,680 evaluation samples +* Columns: sentence1, sentence2, and score +* Approximate statistics based on the first 1000 samples: + | | sentence1 | sentence2 | score | + |:--------|:------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------|:---------------------------------------------------------------| + | type | string | string | float | + | details |
  • min: 14 characters
  • mean: 46.84 characters
  • max: 134 characters
|
  • min: 86 characters
  • mean: 1021.29 characters
  • max: 9471 characters
|
  • min: 0.0
  • mean: 0.31
  • max: 1.0
| +* Samples: + | sentence1 | sentence2 | score | + |:---------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:-----------------| + | מה התשלום עבור דרגה X במעון או משפחתון?
מה הזכאות עבור ילד נוסף במסגרת מעון או משפחתון?
| דרגות השתתפות במימון מעונות יום ומשפחתונים

גובה שכר הלימוד
סכומי השתתפות המדינה והסכומים אותם משלימים ההורים עבור כל אחת מהדרגות, מפורטים בטבלאות שכר לימוד.
הטבלאות מחולקות לפי תעריפים לילדים ותעריפים לתינוקות:

גובה שכר הלימוד בהתאם לדרגות השונות מפורטת בטבלאות שכר לימוד במסגרות ילדים באתר משרד העבודה.
הדרגות המופיעות בטבלאות מותאמות לגובה ההכנסה לנפש במשפחה.
ככל שדרגת הזכאות (בטווח 12-3) נמוכה יותר, כך גדלה השתתפות המדינה בתשלום.
עבור רמות ההשתתפות המותאמות לגובה ההכנסות, יש להוסיף או להפחית דרגות, על-פי מספר הילדים השוהים במעון או במשפחתון - למשפחה עם כמה ילדים השוהים במעון או במשפחתון, תוגדל רמת השתתפות המדינה עבור כל אחד מהילדים (תיקבע דרגה נמוכה יותר בין הדרגות 12-3).<br>
תינוקות הם מי שגילם היה עד 15 חודשים ב-1 בספטמבר של שנת הלימודים אליה נרשמו.
ילדים הם מי שב-1 בספטמבר של שנת הלימודים אליה נרשמו, גילם היה בין 15 חודשים ויום ל-33 חודשים (או עד לגיל 46 חודשים, למי שיש להם אישור מהיחידה להתפתחות הילד כי הם מעוכבי התפתחות).

השפעת מספר הילדים השוהים במעון/משפחתון על דרגת ההשתתפ...
| 0.8 | + | מה התשלום עבור דרגה X במעון או משפחתון?
מה הזכאות עבור ילד נוסף במסגרת מעון או משפחתון?
| סיוע להורים עצמאיים (הורים יחידים) במימון מעונות יום ומשפחתונים

הורים עצמאיים (הורים יחידים) (/he/הורה_עצמאי_(הורה_יחיד)) עשויים להיות זכאים לסיוע במימון מעונות יום ומשפחתונים לפי דרגות השתתפות (/he/דרגות_השתתפות_במימון_מעונות_יום_ומשפחתונים) שנקבעות בהתאם לרמת ההכנסה שלהם (/he/רמת_ההכנסה_לנפש_לצורך_סיוע_במימון_מעונות_יום)



הורים עצמאיים שעונים על הגדרת מגבירי עבודה (.D7.96.D7.9B.D7.90.D7.95.D7.AA_.D7.A9.D7.9C_.D7.94.D7.95.D7.A8.D7.99.D7.9D_.D7.9E.D7.92.D7.91.D7.99.D7.A8.D7.99_.D7.A2.D7.91.D7.95.D7.93.D7.94) זכאים לסיוע מוגדל וישלמו 250 בחודש לילד אחד ו-375 בחודש לשני ילדים
כדי להקל על שילובם של הורים בשוק העבודה, המדינה מסייעת להם במימון מעונות יום ומשפחתונים מוכרים.

גובה התמיכה בזכאות הכללית של ההורים (זוגות הורים או הורים עצמאיים) נקבע על-פי מספר דרגות השתתפות המתבססות על רמת ההכנסה לנפש במשפחה.
הורים עצמאיים (הורים יחידים) שעונים על הגדרת "מגבירי עבודה" (בהתאם למפורט בהמשך), זכאים לסיוע מוגדל וישלמו רק 250 ₪ בחודש עבור ילד אחד ו-375 ₪ בחודש עבור שני ילדים.
לפרטים ומידע כללי ע...
| 0.7 | + | מה התשלום עבור דרגה X במעון או משפחתון?
מה הזכאות עבור ילד נוסף במסגרת מעון או משפחתון?
| השתתפות במימון מעונות יום ומשפחתונים
תהליך מימוש הזכות
איתור מסגרת מוכרת
מערכת אינטרנטית (מקוונת) מאפשרת לאתר משפחתונים, מעונות וצהרונים הנמצאים בפיקוח ממשלתי.
ניתן לבצע חיפוש לפי ישוב, סוג המסגרת או שם המסגרת. לכניסה למערכת לחצו כאן.

התשלום למעון/משפחתון
עבור חודש אוגוסט יחושב תשלום באופן יחסי בהתאם לתקופת שהות הילד במעון (העלות המלאה לחודש כפול מספר החודשים שהילד שהה במעון חלקי 12 חודשים).
החזר כספי בהתאם לדרגת ההשתתפות, יינתן להורים עבור החודש שבו התקבל שאלון ההרשמה במוקד, ועבור החודש שקדם לו.
השתתפות בחודש הראשון לכניסת הילד למסגרת (מעון או משפחתון):

הוריו של ילד שנכנס למסגרת עד ל-15 בחודש (כולל ה-15 לחודש), יהיו זכאים להשתתפות המדינה עבור אותו חודש (על-פי הדרגה שנקבעה להם).
הוריו של ילד שנכנס למסגרת לאחר ה-15 לחודש, לא יהיו זכאים להשתתפות בחודש זה. במקרה זה יחויבו ההורים לשלם דמי החזקה באופן יחסי לימים בהם שהה הילד במסגרת.

היעדרות של הילד
היעדרות מהמעון לתקופה של יותר מ-21 ימים תיחשב כעזיבה, אלא אם כן הוצגו במהלך תקופה זו אישורים רפואיים על מחלה או אשפוז של הילד.
היעדרות של 45...
| 0.8 | +* Loss: [BinaryCrossEntropyLoss](https://sbert.net/docs/package_reference/cross_encoder/losses.html#binarycrossentropyloss) with these parameters: + ```json + { + "activation_fn": "torch.nn.modules.linear.Identity", + "pos_weight": null + } + ``` + +### Training Hyperparameters +#### Non-Default Hyperparameters + +- `eval_strategy`: steps +- `per_device_train_batch_size`: 16 +- `per_device_eval_batch_size`: 16 +- `gradient_accumulation_steps`: 8 +- `learning_rate`: 4e-05 +- `weight_decay`: 0.01 +- `max_grad_norm`: 2.0 +- `num_train_epochs`: 2 +- `max_steps`: 300 +- `warmup_steps`: 300 +- `bf16`: True +- `tf32`: True +- `torch_compile`: True +- `torch_compile_backend`: inductor +- `batch_sampler`: no_duplicates + +#### All Hyperparameters +
Click to expand + +- `overwrite_output_dir`: False +- `do_predict`: False +- `eval_strategy`: steps +- `prediction_loss_only`: True +- `per_device_train_batch_size`: 16 +- `per_device_eval_batch_size`: 16 +- `per_gpu_train_batch_size`: None +- `per_gpu_eval_batch_size`: None +- `gradient_accumulation_steps`: 8 +- `eval_accumulation_steps`: None +- `torch_empty_cache_steps`: None +- `learning_rate`: 4e-05 +- `weight_decay`: 0.01 +- `adam_beta1`: 0.9 +- `adam_beta2`: 0.999 +- `adam_epsilon`: 1e-08 +- `max_grad_norm`: 2.0 +- `num_train_epochs`: 2 +- `max_steps`: 300 +- `lr_scheduler_type`: linear +- `lr_scheduler_kwargs`: {} +- `warmup_ratio`: 0.0 +- `warmup_steps`: 300 +- `log_level`: passive +- `log_level_replica`: warning +- `log_on_each_node`: True +- `logging_nan_inf_filter`: True +- `save_safetensors`: True +- `save_on_each_node`: False +- `save_only_model`: False +- `restore_callback_states_from_checkpoint`: False +- `no_cuda`: False +- `use_cpu`: False +- `use_mps_device`: False +- `seed`: 42 +- `data_seed`: None +- `jit_mode_eval`: False +- `use_ipex`: False +- `bf16`: True +- `fp16`: False +- `fp16_opt_level`: O1 +- `half_precision_backend`: auto +- `bf16_full_eval`: False +- `fp16_full_eval`: False +- `tf32`: True +- `local_rank`: 0 +- `ddp_backend`: None +- `tpu_num_cores`: None +- `tpu_metrics_debug`: False +- `debug`: [] +- `dataloader_drop_last`: False +- `dataloader_num_workers`: 0 +- `dataloader_prefetch_factor`: None +- `past_index`: -1 +- `disable_tqdm`: False +- `remove_unused_columns`: True +- `label_names`: None +- `load_best_model_at_end`: False +- `ignore_data_skip`: False +- `fsdp`: [] +- `fsdp_min_num_params`: 0 +- `fsdp_config`: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False} +- `fsdp_transformer_layer_cls_to_wrap`: None +- `accelerator_config`: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': 
None} +- `deepspeed`: None +- `label_smoothing_factor`: 0.0 +- `optim`: adamw_torch +- `optim_args`: None +- `adafactor`: False +- `group_by_length`: False +- `length_column_name`: length +- `ddp_find_unused_parameters`: None +- `ddp_bucket_cap_mb`: None +- `ddp_broadcast_buffers`: False +- `dataloader_pin_memory`: True +- `dataloader_persistent_workers`: False +- `skip_memory_metrics`: True +- `use_legacy_prediction_loop`: False +- `push_to_hub`: False +- `resume_from_checkpoint`: None +- `hub_model_id`: None +- `hub_strategy`: every_save +- `hub_private_repo`: None +- `hub_always_push`: False +- `hub_revision`: None +- `gradient_checkpointing`: False +- `gradient_checkpointing_kwargs`: None +- `include_inputs_for_metrics`: False +- `include_for_metrics`: [] +- `eval_do_concat_batches`: True +- `fp16_backend`: auto +- `push_to_hub_model_id`: None +- `push_to_hub_organization`: None +- `mp_parameters`: +- `auto_find_batch_size`: False +- `full_determinism`: False +- `torchdynamo`: None +- `ray_scope`: last +- `ddp_timeout`: 1800 +- `torch_compile`: True +- `torch_compile_backend`: inductor +- `torch_compile_mode`: None +- `include_tokens_per_second`: False +- `include_num_input_tokens_seen`: False +- `neftune_noise_alpha`: None +- `optim_target_modules`: None +- `batch_eval_metrics`: False +- `eval_on_start`: False +- `use_liger_kernel`: False +- `liger_kernel_config`: None +- `eval_use_gather_object`: False +- `average_tokens_across_devices`: False +- `prompts`: None +- `batch_sampler`: no_duplicates +- `multi_dataset_batch_sampler`: proportional +- `router_mapping`: {} +- `learning_rate_mapping`: {} + +
+ +### Training Logs +| Epoch | Step | Training Loss | Validation Loss | sts_dev_spearman | +|:------:|:----:|:-------------:|:---------------:|:----------------:| +| -1 | -1 | - | - | 0.4608 | +| 0.2186 | 300 | 0.4926 | 0.4868 | 0.5369 | +| -1 | -1 | - | - | 0.5369 | + + +### Framework Versions +- Python: 3.10.16 +- Sentence Transformers: 5.1.2 +- Transformers: 4.53.2 +- PyTorch: 2.9.0+cu128 +- Accelerate: 1.10.1 +- Datasets: 4.2.0 +- Tokenizers: 0.21.4 + +## Citation + +### BibTeX + +#### Sentence Transformers +```bibtex +@inproceedings{reimers-2019-sentence-bert, + title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks", + author = "Reimers, Nils and Gurevych, Iryna", + booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing", + month = "11", + year = "2019", + publisher = "Association for Computational Linguistics", + url = "https://arxiv.org/abs/1908.10084", +} +``` + + + + + + \ No newline at end of file diff --git a/victord/sub19/models/bge-reranker-v2-m3_pseudo_tune_full/config.json b/victord/sub19/models/bge-reranker-v2-m3_pseudo_tune_full/config.json new file mode 100644 index 0000000000000000000000000000000000000000..875cb725c70258e04b4d2d7c6289d7a235eaac38 --- /dev/null +++ b/victord/sub19/models/bge-reranker-v2-m3_pseudo_tune_full/config.json @@ -0,0 +1,37 @@ +{ + "architectures": [ + "XLMRobertaForSequenceClassification" + ], + "attention_probs_dropout_prob": 0.1, + "bos_token_id": 0, + "classifier_dropout": null, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0" + }, + "initializer_range": 0.02, + "intermediate_size": 4096, + "label2id": { + "LABEL_0": 0 + }, + "layer_norm_eps": 1e-05, + "max_position_embeddings": 8194, + "model_type": "xlm-roberta", + "num_attention_heads": 16, + "num_hidden_layers": 24, + "output_past": true, + "pad_token_id": 1, + "position_embedding_type": "absolute", + 
"sentence_transformers": { + "activation_fn": "torch.nn.modules.activation.Sigmoid", + "version": "5.1.2" + }, + "torch_dtype": "bfloat16", + "transformers_version": "4.53.2", + "type_vocab_size": 1, + "use_cache": true, + "vocab_size": 250002 +} diff --git a/victord/sub19/models/bge-reranker-v2-m3_pseudo_tune_full/model.safetensors b/victord/sub19/models/bge-reranker-v2-m3_pseudo_tune_full/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..70d1dd0d8640185a432fef09dbe7da9f2a279f73 --- /dev/null +++ b/victord/sub19/models/bge-reranker-v2-m3_pseudo_tune_full/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c4e2582945a051835252f00c9967c0083f87e9b56af3fbc482c5eeeb31e8b68 +size 1135560090 diff --git a/victord/sub19/models/bge-reranker-v2-m3_pseudo_tune_full/sentencepiece.bpe.model b/victord/sub19/models/bge-reranker-v2-m3_pseudo_tune_full/sentencepiece.bpe.model new file mode 100644 index 0000000000000000000000000000000000000000..7a3f40a75f870bc1f21700cd414dc2acc431583c --- /dev/null +++ b/victord/sub19/models/bge-reranker-v2-m3_pseudo_tune_full/sentencepiece.bpe.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865 +size 5069051 diff --git a/victord/sub19/models/bge-reranker-v2-m3_pseudo_tune_full/special_tokens_map.json b/victord/sub19/models/bge-reranker-v2-m3_pseudo_tune_full/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b1879d702821e753ffe4245048eee415d54a9385 --- /dev/null +++ b/victord/sub19/models/bge-reranker-v2-m3_pseudo_tune_full/special_tokens_map.json @@ -0,0 +1,51 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "cls_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "mask_token": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "sep_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/victord/sub19/models/bge-reranker-v2-m3_pseudo_tune_full/tokenizer.json b/victord/sub19/models/bge-reranker-v2-m3_pseudo_tune_full/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..23e2b17191098a68bd13cf199532fcc16692805d --- /dev/null +++ b/victord/sub19/models/bge-reranker-v2-m3_pseudo_tune_full/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f346050eed4b822e076c8958539a82bd05b7828c9110f81e7ed7a6d1b853710b +size 17083154 diff --git a/victord/sub19/models/bge-reranker-v2-m3_pseudo_tune_full/tokenizer_config.json b/victord/sub19/models/bge-reranker-v2-m3_pseudo_tune_full/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0895d3866ac41867612360eff24bf05261335c20 --- /dev/null +++ b/victord/sub19/models/bge-reranker-v2-m3_pseudo_tune_full/tokenizer_config.json @@ -0,0 +1,57 @@ +{ + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "250001": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": true, + "cls_token": "", + "eos_token": "", + "extra_special_tokens": {}, + "mask_token": "", + "max_length": 2048, + "model_max_length": 2048, + "pad_token": "", + "sep_token": "", + "sp_model_kwargs": {}, + "tokenizer_class": "XLMRobertaTokenizer", + "unk_token": "" +} diff --git a/victord/sub19/models/multilingual-e5-large-instruct/1_Pooling/config.json b/victord/sub19/models/multilingual-e5-large-instruct/1_Pooling/config.json new file mode 100644 index 0000000000000000000000000000000000000000..3f060ee536308b48017dad1a834f306f115695a3 --- /dev/null +++ b/victord/sub19/models/multilingual-e5-large-instruct/1_Pooling/config.json @@ -0,0 +1,10 @@ +{ + "word_embedding_dimension": 1024, + "pooling_mode_cls_token": false, + "pooling_mode_mean_tokens": true, + "pooling_mode_max_tokens": false, + "pooling_mode_mean_sqrt_len_tokens": false, + "pooling_mode_weightedmean_tokens": false, + "pooling_mode_lasttoken": false, + "include_prompt": true +} \ No newline at end of file diff --git a/victord/sub19/models/multilingual-e5-large-instruct/README.md b/victord/sub19/models/multilingual-e5-large-instruct/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f48430f7761354d4528fda264ffe828e6e6aa1c4 --- /dev/null +++ b/victord/sub19/models/multilingual-e5-large-instruct/README.md @@ -0,0 +1,5538 @@ +--- +tags: +- mteb +- sentence-transformers +- transformers +model-index: +- name: multilingual-e5-large-instruct + results: + - task: + type: Classification + dataset: + type: mteb/amazon_counterfactual + name: MTEB AmazonCounterfactualClassification (en) + config: en + split: test + revision: e8379541af4e31359cca9fbcf4b00f2671dba205 + metrics: + - type: accuracy + value: 76.23880597014924 + - type: 
ap + value: 39.07351965022687 + - type: f1 + value: 70.04836733862683 + - task: + type: Classification + dataset: + type: mteb/amazon_counterfactual + name: MTEB AmazonCounterfactualClassification (de) + config: de + split: test + revision: e8379541af4e31359cca9fbcf4b00f2671dba205 + metrics: + - type: accuracy + value: 66.71306209850107 + - type: ap + value: 79.01499914759529 + - type: f1 + value: 64.81951817560703 + - task: + type: Classification + dataset: + type: mteb/amazon_counterfactual + name: MTEB AmazonCounterfactualClassification (en-ext) + config: en-ext + split: test + revision: e8379541af4e31359cca9fbcf4b00f2671dba205 + metrics: + - type: accuracy + value: 73.85307346326837 + - type: ap + value: 22.447519885878737 + - type: f1 + value: 61.0162730745633 + - task: + type: Classification + dataset: + type: mteb/amazon_counterfactual + name: MTEB AmazonCounterfactualClassification (ja) + config: ja + split: test + revision: e8379541af4e31359cca9fbcf4b00f2671dba205 + metrics: + - type: accuracy + value: 76.04925053533191 + - type: ap + value: 23.44983217128922 + - type: f1 + value: 62.5723230907759 + - task: + type: Classification + dataset: + type: mteb/amazon_polarity + name: MTEB AmazonPolarityClassification + config: default + split: test + revision: e2d317d38cd51312af73b3d32a06d1a08b442046 + metrics: + - type: accuracy + value: 96.28742500000001 + - type: ap + value: 94.8449918887462 + - type: f1 + value: 96.28680923610432 + - task: + type: Classification + dataset: + type: mteb/amazon_reviews_multi + name: MTEB AmazonReviewsClassification (en) + config: en + split: test + revision: 1399c76144fd37290681b995c656ef9b2e06e26d + metrics: + - type: accuracy + value: 56.716 + - type: f1 + value: 55.76510398266401 + - task: + type: Classification + dataset: + type: mteb/amazon_reviews_multi + name: MTEB AmazonReviewsClassification (de) + config: de + split: test + revision: 1399c76144fd37290681b995c656ef9b2e06e26d + metrics: + - type: accuracy + value: 
52.99999999999999 + - type: f1 + value: 52.00829994765178 + - task: + type: Classification + dataset: + type: mteb/amazon_reviews_multi + name: MTEB AmazonReviewsClassification (es) + config: es + split: test + revision: 1399c76144fd37290681b995c656ef9b2e06e26d + metrics: + - type: accuracy + value: 48.806000000000004 + - type: f1 + value: 48.082345914983634 + - task: + type: Classification + dataset: + type: mteb/amazon_reviews_multi + name: MTEB AmazonReviewsClassification (fr) + config: fr + split: test + revision: 1399c76144fd37290681b995c656ef9b2e06e26d + metrics: + - type: accuracy + value: 48.507999999999996 + - type: f1 + value: 47.68752844642045 + - task: + type: Classification + dataset: + type: mteb/amazon_reviews_multi + name: MTEB AmazonReviewsClassification (ja) + config: ja + split: test + revision: 1399c76144fd37290681b995c656ef9b2e06e26d + metrics: + - type: accuracy + value: 47.709999999999994 + - type: f1 + value: 47.05870376637181 + - task: + type: Classification + dataset: + type: mteb/amazon_reviews_multi + name: MTEB AmazonReviewsClassification (zh) + config: zh + split: test + revision: 1399c76144fd37290681b995c656ef9b2e06e26d + metrics: + - type: accuracy + value: 44.662000000000006 + - type: f1 + value: 43.42371965372771 + - task: + type: Retrieval + dataset: + type: arguana + name: MTEB ArguAna + config: default + split: test + revision: None + metrics: + - type: map_at_1 + value: 31.721 + - type: map_at_10 + value: 49.221 + - type: map_at_100 + value: 49.884 + - type: map_at_1000 + value: 49.888 + - type: map_at_3 + value: 44.31 + - type: map_at_5 + value: 47.276 + - type: mrr_at_1 + value: 32.432 + - type: mrr_at_10 + value: 49.5 + - type: mrr_at_100 + value: 50.163000000000004 + - type: mrr_at_1000 + value: 50.166 + - type: mrr_at_3 + value: 44.618 + - type: mrr_at_5 + value: 47.541 + - type: ndcg_at_1 + value: 31.721 + - type: ndcg_at_10 + value: 58.384 + - type: ndcg_at_100 + value: 61.111000000000004 + - type: ndcg_at_1000 + value: 
61.187999999999995 + - type: ndcg_at_3 + value: 48.386 + - type: ndcg_at_5 + value: 53.708999999999996 + - type: precision_at_1 + value: 31.721 + - type: precision_at_10 + value: 8.741 + - type: precision_at_100 + value: 0.991 + - type: precision_at_1000 + value: 0.1 + - type: precision_at_3 + value: 20.057 + - type: precision_at_5 + value: 14.609 + - type: recall_at_1 + value: 31.721 + - type: recall_at_10 + value: 87.411 + - type: recall_at_100 + value: 99.075 + - type: recall_at_1000 + value: 99.644 + - type: recall_at_3 + value: 60.171 + - type: recall_at_5 + value: 73.044 + - task: + type: Clustering + dataset: + type: mteb/arxiv-clustering-p2p + name: MTEB ArxivClusteringP2P + config: default + split: test + revision: a122ad7f3f0291bf49cc6f4d32aa80929df69d5d + metrics: + - type: v_measure + value: 46.40419580759799 + - task: + type: Clustering + dataset: + type: mteb/arxiv-clustering-s2s + name: MTEB ArxivClusteringS2S + config: default + split: test + revision: f910caf1a6075f7329cdf8c1a6135696f37dbd53 + metrics: + - type: v_measure + value: 40.48593255007969 + - task: + type: Reranking + dataset: + type: mteb/askubuntudupquestions-reranking + name: MTEB AskUbuntuDupQuestions + config: default + split: test + revision: 2000358ca161889fa9c082cb41daa8dcfb161a54 + metrics: + - type: map + value: 63.889179122289995 + - type: mrr + value: 77.61146286769556 + - task: + type: STS + dataset: + type: mteb/biosses-sts + name: MTEB BIOSSES + config: default + split: test + revision: d3fb88f8f02e40887cd149695127462bbcf29b4a + metrics: + - type: cos_sim_pearson + value: 88.15075203727929 + - type: cos_sim_spearman + value: 86.9622224570873 + - type: euclidean_pearson + value: 86.70473853624121 + - type: euclidean_spearman + value: 86.9622224570873 + - type: manhattan_pearson + value: 86.21089380980065 + - type: manhattan_spearman + value: 86.75318154937008 + - task: + type: BitextMining + dataset: + type: mteb/bucc-bitext-mining + name: MTEB BUCC (de-en) + config: de-en + 
split: test + revision: d51519689f32196a32af33b075a01d0e7c51e252 + metrics: + - type: accuracy + value: 99.65553235908142 + - type: f1 + value: 99.60681976339595 + - type: precision + value: 99.58246346555325 + - type: recall + value: 99.65553235908142 + - task: + type: BitextMining + dataset: + type: mteb/bucc-bitext-mining + name: MTEB BUCC (fr-en) + config: fr-en + split: test + revision: d51519689f32196a32af33b075a01d0e7c51e252 + metrics: + - type: accuracy + value: 99.26260180497468 + - type: f1 + value: 99.14520507740848 + - type: precision + value: 99.08650671362535 + - type: recall + value: 99.26260180497468 + - task: + type: BitextMining + dataset: + type: mteb/bucc-bitext-mining + name: MTEB BUCC (ru-en) + config: ru-en + split: test + revision: d51519689f32196a32af33b075a01d0e7c51e252 + metrics: + - type: accuracy + value: 98.07412538967787 + - type: f1 + value: 97.86629719431936 + - type: precision + value: 97.76238309664012 + - type: recall + value: 98.07412538967787 + - task: + type: BitextMining + dataset: + type: mteb/bucc-bitext-mining + name: MTEB BUCC (zh-en) + config: zh-en + split: test + revision: d51519689f32196a32af33b075a01d0e7c51e252 + metrics: + - type: accuracy + value: 99.42074776197998 + - type: f1 + value: 99.38564156573635 + - type: precision + value: 99.36808846761454 + - type: recall + value: 99.42074776197998 + - task: + type: Classification + dataset: + type: mteb/banking77 + name: MTEB Banking77Classification + config: default + split: test + revision: 0fd18e25b25c072e09e0d92ab615fda904d66300 + metrics: + - type: accuracy + value: 85.73376623376623 + - type: f1 + value: 85.68480707214599 + - task: + type: Clustering + dataset: + type: mteb/biorxiv-clustering-p2p + name: MTEB BiorxivClusteringP2P + config: default + split: test + revision: 65b79d1d13f80053f67aca9498d9402c2d9f1f40 + metrics: + - type: v_measure + value: 40.935218072113855 + - task: + type: Clustering + dataset: + type: mteb/biorxiv-clustering-s2s + name: MTEB 
BiorxivClusteringS2S + config: default + split: test + revision: 258694dd0231531bc1fd9de6ceb52a0853c6d908 + metrics: + - type: v_measure + value: 36.276389017675264 + - task: + type: Retrieval + dataset: + type: BeIR/cqadupstack + name: MTEB CQADupstackRetrieval + config: default + split: test + revision: None + metrics: + - type: map_at_1 + value: 27.764166666666668 + - type: map_at_10 + value: 37.298166666666674 + - type: map_at_100 + value: 38.530166666666666 + - type: map_at_1000 + value: 38.64416666666667 + - type: map_at_3 + value: 34.484833333333334 + - type: map_at_5 + value: 36.0385 + - type: mrr_at_1 + value: 32.93558333333333 + - type: mrr_at_10 + value: 41.589749999999995 + - type: mrr_at_100 + value: 42.425333333333334 + - type: mrr_at_1000 + value: 42.476333333333336 + - type: mrr_at_3 + value: 39.26825 + - type: mrr_at_5 + value: 40.567083333333336 + - type: ndcg_at_1 + value: 32.93558333333333 + - type: ndcg_at_10 + value: 42.706583333333334 + - type: ndcg_at_100 + value: 47.82483333333333 + - type: ndcg_at_1000 + value: 49.95733333333334 + - type: ndcg_at_3 + value: 38.064750000000004 + - type: ndcg_at_5 + value: 40.18158333333333 + - type: precision_at_1 + value: 32.93558333333333 + - type: precision_at_10 + value: 7.459833333333334 + - type: precision_at_100 + value: 1.1830833333333335 + - type: precision_at_1000 + value: 0.15608333333333332 + - type: precision_at_3 + value: 17.5235 + - type: precision_at_5 + value: 12.349833333333333 + - type: recall_at_1 + value: 27.764166666666668 + - type: recall_at_10 + value: 54.31775 + - type: recall_at_100 + value: 76.74350000000001 + - type: recall_at_1000 + value: 91.45208333333332 + - type: recall_at_3 + value: 41.23425 + - type: recall_at_5 + value: 46.73983333333334 + - task: + type: Retrieval + dataset: + type: climate-fever + name: MTEB ClimateFEVER + config: default + split: test + revision: None + metrics: + - type: map_at_1 + value: 12.969 + - type: map_at_10 + value: 21.584999999999997 + - 
type: map_at_100 + value: 23.3 + - type: map_at_1000 + value: 23.5 + - type: map_at_3 + value: 18.218999999999998 + - type: map_at_5 + value: 19.983 + - type: mrr_at_1 + value: 29.316 + - type: mrr_at_10 + value: 40.033 + - type: mrr_at_100 + value: 40.96 + - type: mrr_at_1000 + value: 41.001 + - type: mrr_at_3 + value: 37.123 + - type: mrr_at_5 + value: 38.757999999999996 + - type: ndcg_at_1 + value: 29.316 + - type: ndcg_at_10 + value: 29.858 + - type: ndcg_at_100 + value: 36.756 + - type: ndcg_at_1000 + value: 40.245999999999995 + - type: ndcg_at_3 + value: 24.822 + - type: ndcg_at_5 + value: 26.565 + - type: precision_at_1 + value: 29.316 + - type: precision_at_10 + value: 9.186 + - type: precision_at_100 + value: 1.6549999999999998 + - type: precision_at_1000 + value: 0.22999999999999998 + - type: precision_at_3 + value: 18.436 + - type: precision_at_5 + value: 13.876 + - type: recall_at_1 + value: 12.969 + - type: recall_at_10 + value: 35.142 + - type: recall_at_100 + value: 59.143 + - type: recall_at_1000 + value: 78.594 + - type: recall_at_3 + value: 22.604 + - type: recall_at_5 + value: 27.883000000000003 + - task: + type: Retrieval + dataset: + type: dbpedia-entity + name: MTEB DBPedia + config: default + split: test + revision: None + metrics: + - type: map_at_1 + value: 8.527999999999999 + - type: map_at_10 + value: 17.974999999999998 + - type: map_at_100 + value: 25.665 + - type: map_at_1000 + value: 27.406000000000002 + - type: map_at_3 + value: 13.017999999999999 + - type: map_at_5 + value: 15.137 + - type: mrr_at_1 + value: 62.5 + - type: mrr_at_10 + value: 71.891 + - type: mrr_at_100 + value: 72.294 + - type: mrr_at_1000 + value: 72.296 + - type: mrr_at_3 + value: 69.958 + - type: mrr_at_5 + value: 71.121 + - type: ndcg_at_1 + value: 50.875 + - type: ndcg_at_10 + value: 38.36 + - type: ndcg_at_100 + value: 44.235 + - type: ndcg_at_1000 + value: 52.154 + - type: ndcg_at_3 + value: 43.008 + - type: ndcg_at_5 + value: 40.083999999999996 + - type: 
precision_at_1 + value: 62.5 + - type: precision_at_10 + value: 30.0 + - type: precision_at_100 + value: 10.038 + - type: precision_at_1000 + value: 2.0869999999999997 + - type: precision_at_3 + value: 46.833000000000006 + - type: precision_at_5 + value: 38.800000000000004 + - type: recall_at_1 + value: 8.527999999999999 + - type: recall_at_10 + value: 23.828 + - type: recall_at_100 + value: 52.322 + - type: recall_at_1000 + value: 77.143 + - type: recall_at_3 + value: 14.136000000000001 + - type: recall_at_5 + value: 17.761 + - task: + type: Classification + dataset: + type: mteb/emotion + name: MTEB EmotionClassification + config: default + split: test + revision: 4f58c6b202a23cf9a4da393831edf4f9183cad37 + metrics: + - type: accuracy + value: 51.51 + - type: f1 + value: 47.632159862049896 + - task: + type: Retrieval + dataset: + type: fever + name: MTEB FEVER + config: default + split: test + revision: None + metrics: + - type: map_at_1 + value: 60.734 + - type: map_at_10 + value: 72.442 + - type: map_at_100 + value: 72.735 + - type: map_at_1000 + value: 72.75 + - type: map_at_3 + value: 70.41199999999999 + - type: map_at_5 + value: 71.80499999999999 + - type: mrr_at_1 + value: 65.212 + - type: mrr_at_10 + value: 76.613 + - type: mrr_at_100 + value: 76.79899999999999 + - type: mrr_at_1000 + value: 76.801 + - type: mrr_at_3 + value: 74.8 + - type: mrr_at_5 + value: 76.12400000000001 + - type: ndcg_at_1 + value: 65.212 + - type: ndcg_at_10 + value: 77.988 + - type: ndcg_at_100 + value: 79.167 + - type: ndcg_at_1000 + value: 79.452 + - type: ndcg_at_3 + value: 74.362 + - type: ndcg_at_5 + value: 76.666 + - type: precision_at_1 + value: 65.212 + - type: precision_at_10 + value: 10.003 + - type: precision_at_100 + value: 1.077 + - type: precision_at_1000 + value: 0.11199999999999999 + - type: precision_at_3 + value: 29.518 + - type: precision_at_5 + value: 19.016 + - type: recall_at_1 + value: 60.734 + - type: recall_at_10 + value: 90.824 + - type: recall_at_100 + 
value: 95.71600000000001 + - type: recall_at_1000 + value: 97.577 + - type: recall_at_3 + value: 81.243 + - type: recall_at_5 + value: 86.90299999999999 + - task: + type: Retrieval + dataset: + type: fiqa + name: MTEB FiQA2018 + config: default + split: test + revision: None + metrics: + - type: map_at_1 + value: 23.845 + - type: map_at_10 + value: 39.281 + - type: map_at_100 + value: 41.422 + - type: map_at_1000 + value: 41.593 + - type: map_at_3 + value: 34.467 + - type: map_at_5 + value: 37.017 + - type: mrr_at_1 + value: 47.531 + - type: mrr_at_10 + value: 56.204 + - type: mrr_at_100 + value: 56.928999999999995 + - type: mrr_at_1000 + value: 56.962999999999994 + - type: mrr_at_3 + value: 54.115 + - type: mrr_at_5 + value: 55.373000000000005 + - type: ndcg_at_1 + value: 47.531 + - type: ndcg_at_10 + value: 47.711999999999996 + - type: ndcg_at_100 + value: 54.510999999999996 + - type: ndcg_at_1000 + value: 57.103 + - type: ndcg_at_3 + value: 44.145 + - type: ndcg_at_5 + value: 45.032 + - type: precision_at_1 + value: 47.531 + - type: precision_at_10 + value: 13.194 + - type: precision_at_100 + value: 2.045 + - type: precision_at_1000 + value: 0.249 + - type: precision_at_3 + value: 29.424 + - type: precision_at_5 + value: 21.451 + - type: recall_at_1 + value: 23.845 + - type: recall_at_10 + value: 54.967 + - type: recall_at_100 + value: 79.11399999999999 + - type: recall_at_1000 + value: 94.56700000000001 + - type: recall_at_3 + value: 40.256 + - type: recall_at_5 + value: 46.215 + - task: + type: Retrieval + dataset: + type: hotpotqa + name: MTEB HotpotQA + config: default + split: test + revision: None + metrics: + - type: map_at_1 + value: 37.819 + - type: map_at_10 + value: 60.889 + - type: map_at_100 + value: 61.717999999999996 + - type: map_at_1000 + value: 61.778 + - type: map_at_3 + value: 57.254000000000005 + - type: map_at_5 + value: 59.541 + - type: mrr_at_1 + value: 75.638 + - type: mrr_at_10 + value: 82.173 + - type: mrr_at_100 + value: 82.362 + - 
type: mrr_at_1000 + value: 82.37 + - type: mrr_at_3 + value: 81.089 + - type: mrr_at_5 + value: 81.827 + - type: ndcg_at_1 + value: 75.638 + - type: ndcg_at_10 + value: 69.317 + - type: ndcg_at_100 + value: 72.221 + - type: ndcg_at_1000 + value: 73.382 + - type: ndcg_at_3 + value: 64.14 + - type: ndcg_at_5 + value: 67.07600000000001 + - type: precision_at_1 + value: 75.638 + - type: precision_at_10 + value: 14.704999999999998 + - type: precision_at_100 + value: 1.698 + - type: precision_at_1000 + value: 0.185 + - type: precision_at_3 + value: 41.394999999999996 + - type: precision_at_5 + value: 27.162999999999997 + - type: recall_at_1 + value: 37.819 + - type: recall_at_10 + value: 73.52499999999999 + - type: recall_at_100 + value: 84.875 + - type: recall_at_1000 + value: 92.559 + - type: recall_at_3 + value: 62.092999999999996 + - type: recall_at_5 + value: 67.907 + - task: + type: Classification + dataset: + type: mteb/imdb + name: MTEB ImdbClassification + config: default + split: test + revision: 3d86128a09e091d6018b6d26cad27f2739fc2db7 + metrics: + - type: accuracy + value: 94.60079999999999 + - type: ap + value: 92.67396345347356 + - type: f1 + value: 94.5988098167121 + - task: + type: Retrieval + dataset: + type: msmarco + name: MTEB MSMARCO + config: default + split: dev + revision: None + metrics: + - type: map_at_1 + value: 21.285 + - type: map_at_10 + value: 33.436 + - type: map_at_100 + value: 34.63 + - type: map_at_1000 + value: 34.681 + - type: map_at_3 + value: 29.412 + - type: map_at_5 + value: 31.715 + - type: mrr_at_1 + value: 21.848 + - type: mrr_at_10 + value: 33.979 + - type: mrr_at_100 + value: 35.118 + - type: mrr_at_1000 + value: 35.162 + - type: mrr_at_3 + value: 30.036 + - type: mrr_at_5 + value: 32.298 + - type: ndcg_at_1 + value: 21.862000000000002 + - type: ndcg_at_10 + value: 40.43 + - type: ndcg_at_100 + value: 46.17 + - type: ndcg_at_1000 + value: 47.412 + - type: ndcg_at_3 + value: 32.221 + - type: ndcg_at_5 + value: 36.332 + - 
type: precision_at_1 + value: 21.862000000000002 + - type: precision_at_10 + value: 6.491 + - type: precision_at_100 + value: 0.935 + - type: precision_at_1000 + value: 0.104 + - type: precision_at_3 + value: 13.744 + - type: precision_at_5 + value: 10.331999999999999 + - type: recall_at_1 + value: 21.285 + - type: recall_at_10 + value: 62.083 + - type: recall_at_100 + value: 88.576 + - type: recall_at_1000 + value: 98.006 + - type: recall_at_3 + value: 39.729 + - type: recall_at_5 + value: 49.608000000000004 + - task: + type: Classification + dataset: + type: mteb/mtop_domain + name: MTEB MTOPDomainClassification (en) + config: en + split: test + revision: d80d48c1eb48d3562165c59d59d0034df9fff0bf + metrics: + - type: accuracy + value: 93.92612859097127 + - type: f1 + value: 93.82370333372853 + - task: + type: Classification + dataset: + type: mteb/mtop_domain + name: MTEB MTOPDomainClassification (de) + config: de + split: test + revision: d80d48c1eb48d3562165c59d59d0034df9fff0bf + metrics: + - type: accuracy + value: 92.67681036911807 + - type: f1 + value: 92.14191382411472 + - task: + type: Classification + dataset: + type: mteb/mtop_domain + name: MTEB MTOPDomainClassification (es) + config: es + split: test + revision: d80d48c1eb48d3562165c59d59d0034df9fff0bf + metrics: + - type: accuracy + value: 92.26817878585723 + - type: f1 + value: 91.92824250337878 + - task: + type: Classification + dataset: + type: mteb/mtop_domain + name: MTEB MTOPDomainClassification (fr) + config: fr + split: test + revision: d80d48c1eb48d3562165c59d59d0034df9fff0bf + metrics: + - type: accuracy + value: 89.96554963983714 + - type: f1 + value: 90.02859329630792 + - task: + type: Classification + dataset: + type: mteb/mtop_domain + name: MTEB MTOPDomainClassification (hi) + config: hi + split: test + revision: d80d48c1eb48d3562165c59d59d0034df9fff0bf + metrics: + - type: accuracy + value: 90.02509860164935 + - type: f1 + value: 89.30665159182062 + - task: + type: Classification + 
dataset: + type: mteb/mtop_domain + name: MTEB MTOPDomainClassification (th) + config: th + split: test + revision: d80d48c1eb48d3562165c59d59d0034df9fff0bf + metrics: + - type: accuracy + value: 87.55515370705244 + - type: f1 + value: 87.94449232331907 + - task: + type: Classification + dataset: + type: mteb/mtop_intent + name: MTEB MTOPIntentClassification (en) + config: en + split: test + revision: ae001d0e6b1228650b7bd1c2c65fb50ad11a8aba + metrics: + - type: accuracy + value: 82.4623803009576 + - type: f1 + value: 66.06738378772725 + - task: + type: Classification + dataset: + type: mteb/mtop_intent + name: MTEB MTOPIntentClassification (de) + config: de + split: test + revision: ae001d0e6b1228650b7bd1c2c65fb50ad11a8aba + metrics: + - type: accuracy + value: 79.3716539870386 + - type: f1 + value: 60.37614033396853 + - task: + type: Classification + dataset: + type: mteb/mtop_intent + name: MTEB MTOPIntentClassification (es) + config: es + split: test + revision: ae001d0e6b1228650b7bd1c2c65fb50ad11a8aba + metrics: + - type: accuracy + value: 80.34022681787857 + - type: f1 + value: 58.302008026952 + - task: + type: Classification + dataset: + type: mteb/mtop_intent + name: MTEB MTOPIntentClassification (fr) + config: fr + split: test + revision: ae001d0e6b1228650b7bd1c2c65fb50ad11a8aba + metrics: + - type: accuracy + value: 76.72095208268087 + - type: f1 + value: 59.64524724009049 + - task: + type: Classification + dataset: + type: mteb/mtop_intent + name: MTEB MTOPIntentClassification (hi) + config: hi + split: test + revision: ae001d0e6b1228650b7bd1c2c65fb50ad11a8aba + metrics: + - type: accuracy + value: 77.87020437432773 + - type: f1 + value: 57.80202694670567 + - task: + type: Classification + dataset: + type: mteb/mtop_intent + name: MTEB MTOPIntentClassification (th) + config: th + split: test + revision: ae001d0e6b1228650b7bd1c2c65fb50ad11a8aba + metrics: + - type: accuracy + value: 77.73598553345387 + - type: f1 + value: 58.19628250675031 + - task: + 
type: Classification + dataset: + type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (af) + config: af + split: test + revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 67.6630800268998 + - type: f1 + value: 65.00996668051691 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (am) + config: am + split: test + revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 60.7128446536651 + - type: f1 + value: 57.95860594874963 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (ar) + config: ar + split: test + revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 63.61129791526563 + - type: f1 + value: 59.75328290206483 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (az) + config: az + split: test + revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 69.00134498991257 + - type: f1 + value: 67.0230483991802 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (bn) + config: bn + split: test + revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 68.54068594485541 + - type: f1 + value: 65.54604628946976 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (cy) + config: cy + split: test + revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 63.032952252858095 + - type: f1 + value: 58.715741857057104 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (da) + config: da + split: test + revision: 
31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 71.80901143241427 + - type: f1 + value: 68.33963989243877 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (de) + config: de + split: test + revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 72.47141896435777 + - type: f1 + value: 69.56765020308262 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (el) + config: el + split: test + revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 71.2373907195696 + - type: f1 + value: 69.04529836036467 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (en) + config: en + split: test + revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 77.05783456624076 + - type: f1 + value: 74.69430584708174 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (es) + config: es + split: test + revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 72.82111634162744 + - type: f1 + value: 70.77228952803762 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (fa) + config: fa + split: test + revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 74.25353059852051 + - type: f1 + value: 71.05310103416411 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (fi) + config: fi + split: test + revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 72.28648285137861 + - type: f1 + value: 69.08020473732226 + - task: + type: Classification + dataset: + 
type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (fr) + config: fr + split: test + revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 73.31540013449899 + - type: f1 + value: 70.9426355465791 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (he) + config: he + split: test + revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 70.2151983860121 + - type: f1 + value: 67.52541755908858 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (hi) + config: hi + split: test + revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 71.58372562205784 + - type: f1 + value: 69.49769064229827 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (hu) + config: hu + split: test + revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 71.9233355749832 + - type: f1 + value: 69.36311548259593 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (hy) + config: hy + split: test + revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 68.07330195023538 + - type: f1 + value: 64.99882022345572 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (id) + config: id + split: test + revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 72.62273032952253 + - type: f1 + value: 70.6394885471001 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (is) + config: is + split: test + revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + 
value: 65.77000672494957 + - type: f1 + value: 62.9368944815065 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (it) + config: it + split: test + revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 73.453261600538 + - type: f1 + value: 70.85069934666681 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (ja) + config: ja + split: test + revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 74.6906523201076 + - type: f1 + value: 72.03249740074217 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (jv) + config: jv + split: test + revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 63.03631472763953 + - type: f1 + value: 59.3165215571852 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (ka) + config: ka + split: test + revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 58.913920645595155 + - type: f1 + value: 57.367337711611285 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (km) + config: km + split: test + revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 54.42837928715535 + - type: f1 + value: 52.60527294970906 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (kn) + config: kn + split: test + revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 66.33490248823135 + - type: f1 + value: 63.213340969404065 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (ko) 
+ config: ko + split: test + revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 70.58507061197041 + - type: f1 + value: 68.40256628040486 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (lv) + config: lv + split: test + revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 69.11230665770006 + - type: f1 + value: 66.44863577842305 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (ml) + config: ml + split: test + revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 69.70073974445192 + - type: f1 + value: 67.21291337273702 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (mn) + config: mn + split: test + revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 66.43913920645595 + - type: f1 + value: 64.09838087422806 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (ms) + config: ms + split: test + revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 70.80026899798251 + - type: f1 + value: 68.76986742962444 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (my) + config: my + split: test + revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 64.78816408876934 + - type: f1 + value: 62.18781873428972 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (nb) + config: nb + split: test + revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 71.6577000672495 + - type: f1 + value: 68.75171511133003 + - task: + 
type: Classification + dataset: + type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (nl) + config: nl + split: test + revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 74.42501681237391 + - type: f1 + value: 71.18434963451544 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (pl) + config: pl + split: test + revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 73.64828513786146 + - type: f1 + value: 70.67741914007422 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (pt) + config: pt + split: test + revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 73.62811028917284 + - type: f1 + value: 71.36402039740959 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (ro) + config: ro + split: test + revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 71.88634835238736 + - type: f1 + value: 69.23701923480677 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (ru) + config: ru + split: test + revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 74.15938130464022 + - type: f1 + value: 71.87792218993388 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (sl) + config: sl + split: test + revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 69.96301277740416 + - type: f1 + value: 67.29584200202983 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (sq) + config: sq + split: test + revision: 
31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 69.49562878278412 + - type: f1 + value: 66.91716685679431 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (sv) + config: sv + split: test + revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 74.6805648957633 + - type: f1 + value: 72.02723592594374 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (sw) + config: sw + split: test + revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 63.00605245460659 + - type: f1 + value: 60.16716669482932 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (ta) + config: ta + split: test + revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 66.90988567585742 + - type: f1 + value: 63.99405488777784 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (te) + config: te + split: test + revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 67.62273032952253 + - type: f1 + value: 65.17213906909481 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (th) + config: th + split: test + revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 69.50907868190988 + - type: f1 + value: 69.15165697194853 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (tl) + config: tl + split: test + revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 69.30733019502352 + - type: f1 + value: 66.69024007380474 + - task: + type: Classification + dataset: + 
type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (tr) + config: tr + split: test + revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 72.24277067921989 + - type: f1 + value: 68.80515408492947 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (ur) + config: ur + split: test + revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 67.49831876260929 + - type: f1 + value: 64.83778567111116 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (vi) + config: vi + split: test + revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 71.28782784129119 + - type: f1 + value: 69.3294186700733 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (zh-CN) + config: zh-CN + split: test + revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 73.315400134499 + - type: f1 + value: 71.22674385243207 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_intent + name: MTEB MassiveIntentClassification (zh-TW) + config: zh-TW + split: test + revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7 + metrics: + - type: accuracy + value: 69.37794216543377 + - type: f1 + value: 68.96962492838232 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (af) + config: af + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + metrics: + - type: accuracy + value: 73.33557498318764 + - type: f1 + value: 72.28949738478356 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (am) + config: am + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + metrics: 
+ - type: accuracy + value: 65.84398117014123 + - type: f1 + value: 64.71026362091463 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (ar) + config: ar + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + metrics: + - type: accuracy + value: 69.76462676529925 + - type: f1 + value: 69.8229667407667 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (az) + config: az + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + metrics: + - type: accuracy + value: 72.02420981842636 + - type: f1 + value: 71.76576384895898 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (bn) + config: bn + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + metrics: + - type: accuracy + value: 72.7572293207801 + - type: f1 + value: 72.76840765295256 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (cy) + config: cy + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + metrics: + - type: accuracy + value: 68.02286482851379 + - type: f1 + value: 66.17237947327872 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (da) + config: da + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + metrics: + - type: accuracy + value: 77.60928043039678 + - type: f1 + value: 77.27094731234773 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (de) + config: de + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + metrics: + - type: accuracy + value: 77.68325487558843 + - type: f1 + value: 77.97530399082261 + - task: + type: Classification + dataset: + type: 
mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (el) + config: el + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + metrics: + - type: accuracy + value: 76.13315400134498 + - type: f1 + value: 75.97558584796424 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (en) + config: en + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + metrics: + - type: accuracy + value: 80.47410894418292 + - type: f1 + value: 80.52244841473792 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (es) + config: es + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + metrics: + - type: accuracy + value: 76.9670477471419 + - type: f1 + value: 77.37318805793146 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (fa) + config: fa + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + metrics: + - type: accuracy + value: 78.09683927370544 + - type: f1 + value: 77.69773737430847 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (fi) + config: fi + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + metrics: + - type: accuracy + value: 75.20847343644922 + - type: f1 + value: 75.17071738727348 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (fr) + config: fr + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + metrics: + - type: accuracy + value: 77.07464694014796 + - type: f1 + value: 77.16136207698571 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (he) + config: he + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + 
metrics: + - type: accuracy + value: 73.53396099529255 + - type: f1 + value: 73.58296404484122 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (hi) + config: hi + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + metrics: + - type: accuracy + value: 75.75319435104237 + - type: f1 + value: 75.24674707850833 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (hu) + config: hu + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + metrics: + - type: accuracy + value: 77.0948217888366 + - type: f1 + value: 76.47559490205028 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (hy) + config: hy + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + metrics: + - type: accuracy + value: 71.07599193006052 + - type: f1 + value: 70.76028043093511 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (id) + config: id + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + metrics: + - type: accuracy + value: 77.10490921318089 + - type: f1 + value: 77.01215275283272 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (is) + config: is + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + metrics: + - type: accuracy + value: 71.25756556825824 + - type: f1 + value: 70.20605314648762 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (it) + config: it + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + metrics: + - type: accuracy + value: 77.08137188971082 + - type: f1 + value: 77.3899269057439 + - task: + type: Classification + dataset: + type: 
mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (ja) + config: ja + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + metrics: + - type: accuracy + value: 79.35440484196369 + - type: f1 + value: 79.58964690002772 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (jv) + config: jv + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + metrics: + - type: accuracy + value: 68.42299932750504 + - type: f1 + value: 68.07844356925413 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (ka) + config: ka + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + metrics: + - type: accuracy + value: 66.15669132481507 + - type: f1 + value: 65.89383352608513 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (km) + config: km + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + metrics: + - type: accuracy + value: 60.11432414256894 + - type: f1 + value: 57.69910594559806 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (kn) + config: kn + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + metrics: + - type: accuracy + value: 71.24747814391392 + - type: f1 + value: 70.42455553830918 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (ko) + config: ko + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + metrics: + - type: accuracy + value: 76.46267652992603 + - type: f1 + value: 76.8854559308316 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (lv) + config: lv + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + 
metrics: + - type: accuracy + value: 73.24815063887021 + - type: f1 + value: 72.77805034658074 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (ml) + config: ml + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + metrics: + - type: accuracy + value: 74.11566913248151 + - type: f1 + value: 73.86147988001356 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (mn) + config: mn + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + metrics: + - type: accuracy + value: 70.0168123739072 + - type: f1 + value: 69.38515920054571 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (ms) + config: ms + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + metrics: + - type: accuracy + value: 74.41156691324814 + - type: f1 + value: 73.43474953408237 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (my) + config: my + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + metrics: + - type: accuracy + value: 68.39609952925353 + - type: f1 + value: 67.29731681109291 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (nb) + config: nb + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + metrics: + - type: accuracy + value: 77.20914593140552 + - type: f1 + value: 77.07066497935367 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (nl) + config: nl + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + metrics: + - type: accuracy + value: 78.52387357094821 + - type: f1 + value: 78.5259569473291 + - task: + type: Classification + dataset: + type: 
mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (pl) + config: pl + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + metrics: + - type: accuracy + value: 76.6913248150639 + - type: f1 + value: 76.91201656350455 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (pt) + config: pt + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + metrics: + - type: accuracy + value: 77.1217215870881 + - type: f1 + value: 77.41179937912504 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (ro) + config: ro + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + metrics: + - type: accuracy + value: 75.25891055817083 + - type: f1 + value: 75.8089244542887 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (ru) + config: ru + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + metrics: + - type: accuracy + value: 77.70679219905851 + - type: f1 + value: 78.21459594517711 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (sl) + config: sl + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + metrics: + - type: accuracy + value: 74.83523873570948 + - type: f1 + value: 74.86847028401978 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (sq) + config: sq + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + metrics: + - type: accuracy + value: 74.71755211835911 + - type: f1 + value: 74.0214326485662 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (sv) + config: sv + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + metrics: 
+ - type: accuracy + value: 79.06523201075991 + - type: f1 + value: 79.10545620325138 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (sw) + config: sw + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + metrics: + - type: accuracy + value: 67.91862811028918 + - type: f1 + value: 66.50386121217983 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (ta) + config: ta + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + metrics: + - type: accuracy + value: 70.93140551445865 + - type: f1 + value: 70.755435928495 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (te) + config: te + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + metrics: + - type: accuracy + value: 72.40753194351042 + - type: f1 + value: 71.61816115782923 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (th) + config: th + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + metrics: + - type: accuracy + value: 75.1815736381977 + - type: f1 + value: 75.08016717887205 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (tl) + config: tl + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + metrics: + - type: accuracy + value: 72.86482851378614 + - type: f1 + value: 72.39521180006291 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (tr) + config: tr + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + metrics: + - type: accuracy + value: 76.46940147948891 + - type: f1 + value: 76.70044085362349 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_scenario 
+ name: MTEB MassiveScenarioClassification (ur) + config: ur + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + metrics: + - type: accuracy + value: 71.89307330195024 + - type: f1 + value: 71.5721825332298 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (vi) + config: vi + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + metrics: + - type: accuracy + value: 74.7511768661735 + - type: f1 + value: 75.17918654541515 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (zh-CN) + config: zh-CN + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + metrics: + - type: accuracy + value: 78.69535978480162 + - type: f1 + value: 78.90019070153316 + - task: + type: Classification + dataset: + type: mteb/amazon_massive_scenario + name: MTEB MassiveScenarioClassification (zh-TW) + config: zh-TW + split: test + revision: 7d571f92784cd94a019292a1f45445077d0ef634 + metrics: + - type: accuracy + value: 75.45729657027572 + - type: f1 + value: 76.19578371794672 + - task: + type: Clustering + dataset: + type: mteb/medrxiv-clustering-p2p + name: MTEB MedrxivClusteringP2P + config: default + split: test + revision: e7a26af6f3ae46b30dde8737f02c07b1505bcc73 + metrics: + - type: v_measure + value: 36.92715354123554 + - task: + type: Clustering + dataset: + type: mteb/medrxiv-clustering-s2s + name: MTEB MedrxivClusteringS2S + config: default + split: test + revision: 35191c8c0dca72d8ff3efcd72aa802307d469663 + metrics: + - type: v_measure + value: 35.53536244162518 + - task: + type: Reranking + dataset: + type: mteb/mind_small + name: MTEB MindSmallReranking + config: default + split: test + revision: 3bdac13927fdc888b903db93b2ffdbd90b295a69 + metrics: + - type: map + value: 33.08507884504006 + - type: mrr + value: 34.32436977159129 + - task: + type: Retrieval + dataset: + type: nfcorpus + name: 
MTEB NFCorpus + config: default + split: test + revision: None + metrics: + - type: map_at_1 + value: 5.935 + - type: map_at_10 + value: 13.297 + - type: map_at_100 + value: 16.907 + - type: map_at_1000 + value: 18.391 + - type: map_at_3 + value: 9.626999999999999 + - type: map_at_5 + value: 11.190999999999999 + - type: mrr_at_1 + value: 46.129999999999995 + - type: mrr_at_10 + value: 54.346000000000004 + - type: mrr_at_100 + value: 55.067 + - type: mrr_at_1000 + value: 55.1 + - type: mrr_at_3 + value: 51.961 + - type: mrr_at_5 + value: 53.246 + - type: ndcg_at_1 + value: 44.118 + - type: ndcg_at_10 + value: 35.534 + - type: ndcg_at_100 + value: 32.946999999999996 + - type: ndcg_at_1000 + value: 41.599000000000004 + - type: ndcg_at_3 + value: 40.25 + - type: ndcg_at_5 + value: 37.978 + - type: precision_at_1 + value: 46.129999999999995 + - type: precision_at_10 + value: 26.842 + - type: precision_at_100 + value: 8.427 + - type: precision_at_1000 + value: 2.128 + - type: precision_at_3 + value: 37.977 + - type: precision_at_5 + value: 32.879000000000005 + - type: recall_at_1 + value: 5.935 + - type: recall_at_10 + value: 17.211000000000002 + - type: recall_at_100 + value: 34.33 + - type: recall_at_1000 + value: 65.551 + - type: recall_at_3 + value: 10.483 + - type: recall_at_5 + value: 13.078999999999999 + - task: + type: Retrieval + dataset: + type: nq + name: MTEB NQ + config: default + split: test + revision: None + metrics: + - type: map_at_1 + value: 35.231 + - type: map_at_10 + value: 50.202000000000005 + - type: map_at_100 + value: 51.154999999999994 + - type: map_at_1000 + value: 51.181 + - type: map_at_3 + value: 45.774 + - type: map_at_5 + value: 48.522 + - type: mrr_at_1 + value: 39.687 + - type: mrr_at_10 + value: 52.88 + - type: mrr_at_100 + value: 53.569 + - type: mrr_at_1000 + value: 53.58500000000001 + - type: mrr_at_3 + value: 49.228 + - type: mrr_at_5 + value: 51.525 + - type: ndcg_at_1 + value: 39.687 + - type: ndcg_at_10 + value: 
57.754000000000005 + - type: ndcg_at_100 + value: 61.597 + - type: ndcg_at_1000 + value: 62.18900000000001 + - type: ndcg_at_3 + value: 49.55 + - type: ndcg_at_5 + value: 54.11899999999999 + - type: precision_at_1 + value: 39.687 + - type: precision_at_10 + value: 9.313 + - type: precision_at_100 + value: 1.146 + - type: precision_at_1000 + value: 0.12 + - type: precision_at_3 + value: 22.229 + - type: precision_at_5 + value: 15.939 + - type: recall_at_1 + value: 35.231 + - type: recall_at_10 + value: 78.083 + - type: recall_at_100 + value: 94.42099999999999 + - type: recall_at_1000 + value: 98.81 + - type: recall_at_3 + value: 57.047000000000004 + - type: recall_at_5 + value: 67.637 + - task: + type: Retrieval + dataset: + type: quora + name: MTEB QuoraRetrieval + config: default + split: test + revision: None + metrics: + - type: map_at_1 + value: 71.241 + - type: map_at_10 + value: 85.462 + - type: map_at_100 + value: 86.083 + - type: map_at_1000 + value: 86.09700000000001 + - type: map_at_3 + value: 82.49499999999999 + - type: map_at_5 + value: 84.392 + - type: mrr_at_1 + value: 82.09 + - type: mrr_at_10 + value: 88.301 + - type: mrr_at_100 + value: 88.383 + - type: mrr_at_1000 + value: 88.384 + - type: mrr_at_3 + value: 87.37 + - type: mrr_at_5 + value: 88.035 + - type: ndcg_at_1 + value: 82.12 + - type: ndcg_at_10 + value: 89.149 + - type: ndcg_at_100 + value: 90.235 + - type: ndcg_at_1000 + value: 90.307 + - type: ndcg_at_3 + value: 86.37599999999999 + - type: ndcg_at_5 + value: 87.964 + - type: precision_at_1 + value: 82.12 + - type: precision_at_10 + value: 13.56 + - type: precision_at_100 + value: 1.539 + - type: precision_at_1000 + value: 0.157 + - type: precision_at_3 + value: 37.88 + - type: precision_at_5 + value: 24.92 + - type: recall_at_1 + value: 71.241 + - type: recall_at_10 + value: 96.128 + - type: recall_at_100 + value: 99.696 + - type: recall_at_1000 + value: 99.994 + - type: recall_at_3 + value: 88.181 + - type: recall_at_5 + value: 92.694 + 
- task: + type: Clustering + dataset: + type: mteb/reddit-clustering + name: MTEB RedditClustering + config: default + split: test + revision: 24640382cdbf8abc73003fb0fa6d111a705499eb + metrics: + - type: v_measure + value: 56.59757799655151 + - task: + type: Clustering + dataset: + type: mteb/reddit-clustering-p2p + name: MTEB RedditClusteringP2P + config: default + split: test + revision: 282350215ef01743dc01b456c7f5241fa8937f16 + metrics: + - type: v_measure + value: 64.27391998854624 + - task: + type: Retrieval + dataset: + type: scidocs + name: MTEB SCIDOCS + config: default + split: test + revision: None + metrics: + - type: map_at_1 + value: 4.243 + - type: map_at_10 + value: 10.965 + - type: map_at_100 + value: 12.934999999999999 + - type: map_at_1000 + value: 13.256 + - type: map_at_3 + value: 7.907 + - type: map_at_5 + value: 9.435 + - type: mrr_at_1 + value: 20.9 + - type: mrr_at_10 + value: 31.849 + - type: mrr_at_100 + value: 32.964 + - type: mrr_at_1000 + value: 33.024 + - type: mrr_at_3 + value: 28.517 + - type: mrr_at_5 + value: 30.381999999999998 + - type: ndcg_at_1 + value: 20.9 + - type: ndcg_at_10 + value: 18.723 + - type: ndcg_at_100 + value: 26.384999999999998 + - type: ndcg_at_1000 + value: 32.114 + - type: ndcg_at_3 + value: 17.753 + - type: ndcg_at_5 + value: 15.558 + - type: precision_at_1 + value: 20.9 + - type: precision_at_10 + value: 9.8 + - type: precision_at_100 + value: 2.078 + - type: precision_at_1000 + value: 0.345 + - type: precision_at_3 + value: 16.900000000000002 + - type: precision_at_5 + value: 13.88 + - type: recall_at_1 + value: 4.243 + - type: recall_at_10 + value: 19.885 + - type: recall_at_100 + value: 42.17 + - type: recall_at_1000 + value: 70.12 + - type: recall_at_3 + value: 10.288 + - type: recall_at_5 + value: 14.072000000000001 + - task: + type: STS + dataset: + type: mteb/sickr-sts + name: MTEB SICK-R + config: default + split: test + revision: a6ea5a8cab320b040a23452cc28066d9beae2cee + metrics: + - type: 
cos_sim_pearson + value: 85.84209174935282 + - type: cos_sim_spearman + value: 81.73248048438833 + - type: euclidean_pearson + value: 83.02810070308149 + - type: euclidean_spearman + value: 81.73248295679514 + - type: manhattan_pearson + value: 82.95368060376002 + - type: manhattan_spearman + value: 81.60277910998718 + - task: + type: STS + dataset: + type: mteb/sts12-sts + name: MTEB STS12 + config: default + split: test + revision: a0d554a64d88156834ff5ae9920b964011b16384 + metrics: + - type: cos_sim_pearson + value: 88.52628804556943 + - type: cos_sim_spearman + value: 82.5713913555672 + - type: euclidean_pearson + value: 85.8796774746988 + - type: euclidean_spearman + value: 82.57137506803424 + - type: manhattan_pearson + value: 85.79671002960058 + - type: manhattan_spearman + value: 82.49445981618027 + - task: + type: STS + dataset: + type: mteb/sts13-sts + name: MTEB STS13 + config: default + split: test + revision: 7e90230a92c190f1bf69ae9002b8cea547a64cca + metrics: + - type: cos_sim_pearson + value: 86.23682503505542 + - type: cos_sim_spearman + value: 87.15008956711806 + - type: euclidean_pearson + value: 86.79805401524959 + - type: euclidean_spearman + value: 87.15008956711806 + - type: manhattan_pearson + value: 86.65298502699244 + - type: manhattan_spearman + value: 86.97677821948562 + - task: + type: STS + dataset: + type: mteb/sts14-sts + name: MTEB STS14 + config: default + split: test + revision: 6031580fec1f6af667f0bd2da0a551cf4f0b2375 + metrics: + - type: cos_sim_pearson + value: 85.63370304677802 + - type: cos_sim_spearman + value: 84.97105553540318 + - type: euclidean_pearson + value: 85.28896108687721 + - type: euclidean_spearman + value: 84.97105553540318 + - type: manhattan_pearson + value: 85.09663190337331 + - type: manhattan_spearman + value: 84.79126831644619 + - task: + type: STS + dataset: + type: mteb/sts15-sts + name: MTEB STS15 + config: default + split: test + revision: ae752c7c21bf194d8b67fd573edf7ae58183cbe3 + metrics: + - type: 
cos_sim_pearson + value: 90.2614838800733 + - type: cos_sim_spearman + value: 91.0509162991835 + - type: euclidean_pearson + value: 90.33098317533373 + - type: euclidean_spearman + value: 91.05091625871644 + - type: manhattan_pearson + value: 90.26250435151107 + - type: manhattan_spearman + value: 90.97999594417519 + - task: + type: STS + dataset: + type: mteb/sts16-sts + name: MTEB STS16 + config: default + split: test + revision: 4d8694f8f0e0100860b497b999b3dbed754a0513 + metrics: + - type: cos_sim_pearson + value: 85.80480973335091 + - type: cos_sim_spearman + value: 87.313695492969 + - type: euclidean_pearson + value: 86.49267251576939 + - type: euclidean_spearman + value: 87.313695492969 + - type: manhattan_pearson + value: 86.44019901831935 + - type: manhattan_spearman + value: 87.24205395460392 + - task: + type: STS + dataset: + type: mteb/sts17-crosslingual-sts + name: MTEB STS17 (en-en) + config: en-en + split: test + revision: af5e6fb845001ecf41f4c1e033ce921939a2a68d + metrics: + - type: cos_sim_pearson + value: 90.05662789380672 + - type: cos_sim_spearman + value: 90.02759424426651 + - type: euclidean_pearson + value: 90.4042483422981 + - type: euclidean_spearman + value: 90.02759424426651 + - type: manhattan_pearson + value: 90.51446975000226 + - type: manhattan_spearman + value: 90.08832889933616 + - task: + type: STS + dataset: + type: mteb/sts22-crosslingual-sts + name: MTEB STS22 (en) + config: en + split: test + revision: 6d1ba47164174a496b7fa5d3569dae26a6813b80 + metrics: + - type: cos_sim_pearson + value: 67.5975528273532 + - type: cos_sim_spearman + value: 67.62969861411354 + - type: euclidean_pearson + value: 69.224275734323 + - type: euclidean_spearman + value: 67.62969861411354 + - type: manhattan_pearson + value: 69.3761447059927 + - type: manhattan_spearman + value: 67.90921005611467 + - task: + type: STS + dataset: + type: mteb/stsbenchmark-sts + name: MTEB STSBenchmark + config: default + split: test + revision: 
b0fddb56ed78048fa8b90373c8a3cfc37b684831 + metrics: + - type: cos_sim_pearson + value: 87.11244327231684 + - type: cos_sim_spearman + value: 88.37902438979035 + - type: euclidean_pearson + value: 87.86054279847336 + - type: euclidean_spearman + value: 88.37902438979035 + - type: manhattan_pearson + value: 87.77257757320378 + - type: manhattan_spearman + value: 88.25208966098123 + - task: + type: Reranking + dataset: + type: mteb/scidocs-reranking + name: MTEB SciDocsRR + config: default + split: test + revision: d3c5e1fc0b855ab6097bf1cda04dd73947d7caab + metrics: + - type: map + value: 85.87174608143563 + - type: mrr + value: 96.12836872640794 + - task: + type: Retrieval + dataset: + type: scifact + name: MTEB SciFact + config: default + split: test + revision: None + metrics: + - type: map_at_1 + value: 57.760999999999996 + - type: map_at_10 + value: 67.258 + - type: map_at_100 + value: 67.757 + - type: map_at_1000 + value: 67.78800000000001 + - type: map_at_3 + value: 64.602 + - type: map_at_5 + value: 65.64 + - type: mrr_at_1 + value: 60.667 + - type: mrr_at_10 + value: 68.441 + - type: mrr_at_100 + value: 68.825 + - type: mrr_at_1000 + value: 68.853 + - type: mrr_at_3 + value: 66.444 + - type: mrr_at_5 + value: 67.26100000000001 + - type: ndcg_at_1 + value: 60.667 + - type: ndcg_at_10 + value: 71.852 + - type: ndcg_at_100 + value: 73.9 + - type: ndcg_at_1000 + value: 74.628 + - type: ndcg_at_3 + value: 67.093 + - type: ndcg_at_5 + value: 68.58 + - type: precision_at_1 + value: 60.667 + - type: precision_at_10 + value: 9.6 + - type: precision_at_100 + value: 1.0670000000000002 + - type: precision_at_1000 + value: 0.11199999999999999 + - type: precision_at_3 + value: 26.111 + - type: precision_at_5 + value: 16.733 + - type: recall_at_1 + value: 57.760999999999996 + - type: recall_at_10 + value: 84.967 + - type: recall_at_100 + value: 93.833 + - type: recall_at_1000 + value: 99.333 + - type: recall_at_3 + value: 71.589 + - type: recall_at_5 + value: 75.483 + - 
task: + type: PairClassification + dataset: + type: mteb/sprintduplicatequestions-pairclassification + name: MTEB SprintDuplicateQuestions + config: default + split: test + revision: d66bd1f72af766a5cc4b0ca5e00c162f89e8cc46 + metrics: + - type: cos_sim_accuracy + value: 99.66633663366336 + - type: cos_sim_ap + value: 91.17685358899108 + - type: cos_sim_f1 + value: 82.16818642350559 + - type: cos_sim_precision + value: 83.26488706365504 + - type: cos_sim_recall + value: 81.10000000000001 + - type: dot_accuracy + value: 99.66633663366336 + - type: dot_ap + value: 91.17663411119032 + - type: dot_f1 + value: 82.16818642350559 + - type: dot_precision + value: 83.26488706365504 + - type: dot_recall + value: 81.10000000000001 + - type: euclidean_accuracy + value: 99.66633663366336 + - type: euclidean_ap + value: 91.17685189882275 + - type: euclidean_f1 + value: 82.16818642350559 + - type: euclidean_precision + value: 83.26488706365504 + - type: euclidean_recall + value: 81.10000000000001 + - type: manhattan_accuracy + value: 99.66633663366336 + - type: manhattan_ap + value: 91.2241619496737 + - type: manhattan_f1 + value: 82.20472440944883 + - type: manhattan_precision + value: 86.51933701657458 + - type: manhattan_recall + value: 78.3 + - type: max_accuracy + value: 99.66633663366336 + - type: max_ap + value: 91.2241619496737 + - type: max_f1 + value: 82.20472440944883 + - task: + type: Clustering + dataset: + type: mteb/stackexchange-clustering + name: MTEB StackExchangeClustering + config: default + split: test + revision: 6cbc1f7b2bc0622f2e39d2c77fa502909748c259 + metrics: + - type: v_measure + value: 66.85101268897951 + - task: + type: Clustering + dataset: + type: mteb/stackexchange-clustering-p2p + name: MTEB StackExchangeClusteringP2P + config: default + split: test + revision: 815ca46b2622cec33ccafc3735d572c266efdb44 + metrics: + - type: v_measure + value: 42.461184054706905 + - task: + type: Reranking + dataset: + type: mteb/stackoverflowdupquestions-reranking + 
name: MTEB StackOverflowDupQuestions + config: default + split: test + revision: e185fbe320c72810689fc5848eb6114e1ef5ec69 + metrics: + - type: map + value: 51.44542568873886 + - type: mrr + value: 52.33656151854681 + - task: + type: Summarization + dataset: + type: mteb/summeval + name: MTEB SummEval + config: default + split: test + revision: cda12ad7615edc362dbf25a00fdd61d3b1eaf93c + metrics: + - type: cos_sim_pearson + value: 30.75982974997539 + - type: cos_sim_spearman + value: 30.385405026539914 + - type: dot_pearson + value: 30.75982433546523 + - type: dot_spearman + value: 30.385405026539914 + - task: + type: Retrieval + dataset: + type: trec-covid + name: MTEB TRECCOVID + config: default + split: test + revision: None + metrics: + - type: map_at_1 + value: 0.22799999999999998 + - type: map_at_10 + value: 2.064 + - type: map_at_100 + value: 13.056000000000001 + - type: map_at_1000 + value: 31.747999999999998 + - type: map_at_3 + value: 0.67 + - type: map_at_5 + value: 1.097 + - type: mrr_at_1 + value: 90.0 + - type: mrr_at_10 + value: 94.667 + - type: mrr_at_100 + value: 94.667 + - type: mrr_at_1000 + value: 94.667 + - type: mrr_at_3 + value: 94.667 + - type: mrr_at_5 + value: 94.667 + - type: ndcg_at_1 + value: 86.0 + - type: ndcg_at_10 + value: 82.0 + - type: ndcg_at_100 + value: 64.307 + - type: ndcg_at_1000 + value: 57.023999999999994 + - type: ndcg_at_3 + value: 85.816 + - type: ndcg_at_5 + value: 84.904 + - type: precision_at_1 + value: 90.0 + - type: precision_at_10 + value: 85.8 + - type: precision_at_100 + value: 66.46 + - type: precision_at_1000 + value: 25.202 + - type: precision_at_3 + value: 90.0 + - type: precision_at_5 + value: 89.2 + - type: recall_at_1 + value: 0.22799999999999998 + - type: recall_at_10 + value: 2.235 + - type: recall_at_100 + value: 16.185 + - type: recall_at_1000 + value: 53.620999999999995 + - type: recall_at_3 + value: 0.7040000000000001 + - type: recall_at_5 + value: 1.172 + - task: + type: BitextMining + dataset: + 
type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (sqi-eng) + config: sqi-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 97.39999999999999 + - type: f1 + value: 96.75 + - type: precision + value: 96.45 + - type: recall + value: 97.39999999999999 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (fry-eng) + config: fry-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 85.54913294797689 + - type: f1 + value: 82.46628131021194 + - type: precision + value: 81.1175337186898 + - type: recall + value: 85.54913294797689 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (kur-eng) + config: kur-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 81.21951219512195 + - type: f1 + value: 77.33333333333334 + - type: precision + value: 75.54878048780488 + - type: recall + value: 81.21951219512195 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (tur-eng) + config: tur-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 98.6 + - type: f1 + value: 98.26666666666665 + - type: precision + value: 98.1 + - type: recall + value: 98.6 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (deu-eng) + config: deu-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 99.5 + - type: f1 + value: 99.33333333333333 + - type: precision + value: 99.25 + - type: recall + value: 99.5 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (nld-eng) + config: nld-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 97.8 + - 
type: f1 + value: 97.2 + - type: precision + value: 96.89999999999999 + - type: recall + value: 97.8 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (ron-eng) + config: ron-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 97.8 + - type: f1 + value: 97.18333333333334 + - type: precision + value: 96.88333333333333 + - type: recall + value: 97.8 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (ang-eng) + config: ang-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 77.61194029850746 + - type: f1 + value: 72.81094527363183 + - type: precision + value: 70.83333333333333 + - type: recall + value: 77.61194029850746 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (ido-eng) + config: ido-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 93.7 + - type: f1 + value: 91.91666666666667 + - type: precision + value: 91.08333333333334 + - type: recall + value: 93.7 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (jav-eng) + config: jav-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 88.29268292682927 + - type: f1 + value: 85.27642276422765 + - type: precision + value: 84.01277584204414 + - type: recall + value: 88.29268292682927 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (isl-eng) + config: isl-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 96.1 + - type: f1 + value: 95.0 + - type: precision + value: 94.46666666666668 + - type: recall + value: 96.1 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: 
MTEB Tatoeba (slv-eng) + config: slv-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 93.681652490887 + - type: f1 + value: 91.90765492102065 + - type: precision + value: 91.05913325232888 + - type: recall + value: 93.681652490887 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (cym-eng) + config: cym-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 92.17391304347827 + - type: f1 + value: 89.97101449275361 + - type: precision + value: 88.96811594202899 + - type: recall + value: 92.17391304347827 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (kaz-eng) + config: kaz-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 90.43478260869566 + - type: f1 + value: 87.72173913043478 + - type: precision + value: 86.42028985507245 + - type: recall + value: 90.43478260869566 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (est-eng) + config: est-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 90.4 + - type: f1 + value: 88.03 + - type: precision + value: 86.95 + - type: recall + value: 90.4 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (heb-eng) + config: heb-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 93.4 + - type: f1 + value: 91.45666666666666 + - type: precision + value: 90.525 + - type: recall + value: 93.4 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (gla-eng) + config: gla-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 81.9059107358263 + - type: f1 + value: 
78.32557872364869 + - type: precision + value: 76.78260286824823 + - type: recall + value: 81.9059107358263 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (mar-eng) + config: mar-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 94.3 + - type: f1 + value: 92.58333333333333 + - type: precision + value: 91.73333333333332 + - type: recall + value: 94.3 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (lat-eng) + config: lat-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 79.10000000000001 + - type: f1 + value: 74.50500000000001 + - type: precision + value: 72.58928571428571 + - type: recall + value: 79.10000000000001 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (bel-eng) + config: bel-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 96.6 + - type: f1 + value: 95.55 + - type: precision + value: 95.05 + - type: recall + value: 96.6 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (pms-eng) + config: pms-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 82.0952380952381 + - type: f1 + value: 77.98458049886621 + - type: precision + value: 76.1968253968254 + - type: recall + value: 82.0952380952381 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (gle-eng) + config: gle-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 87.9 + - type: f1 + value: 84.99190476190476 + - type: precision + value: 83.65 + - type: recall + value: 87.9 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba 
(pes-eng) + config: pes-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 95.7 + - type: f1 + value: 94.56666666666666 + - type: precision + value: 94.01666666666667 + - type: recall + value: 95.7 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (nob-eng) + config: nob-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 98.6 + - type: f1 + value: 98.2 + - type: precision + value: 98.0 + - type: recall + value: 98.6 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (bul-eng) + config: bul-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 95.6 + - type: f1 + value: 94.38333333333334 + - type: precision + value: 93.78333333333335 + - type: recall + value: 95.6 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (cbk-eng) + config: cbk-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 87.4 + - type: f1 + value: 84.10380952380952 + - type: precision + value: 82.67 + - type: recall + value: 87.4 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (hun-eng) + config: hun-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 95.5 + - type: f1 + value: 94.33333333333334 + - type: precision + value: 93.78333333333333 + - type: recall + value: 95.5 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (uig-eng) + config: uig-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 89.4 + - type: f1 + value: 86.82000000000001 + - type: precision + value: 85.64500000000001 + - type: recall + value: 89.4 + - 
task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (rus-eng) + config: rus-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 95.1 + - type: f1 + value: 93.56666666666668 + - type: precision + value: 92.81666666666666 + - type: recall + value: 95.1 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (spa-eng) + config: spa-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 98.9 + - type: f1 + value: 98.6 + - type: precision + value: 98.45 + - type: recall + value: 98.9 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (hye-eng) + config: hye-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 95.01347708894879 + - type: f1 + value: 93.51752021563343 + - type: precision + value: 92.82794249775381 + - type: recall + value: 95.01347708894879 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (tel-eng) + config: tel-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 97.00854700854701 + - type: f1 + value: 96.08262108262107 + - type: precision + value: 95.65527065527067 + - type: recall + value: 97.00854700854701 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (afr-eng) + config: afr-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 96.5 + - type: f1 + value: 95.39999999999999 + - type: precision + value: 94.88333333333333 + - type: recall + value: 96.5 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (mon-eng) + config: mon-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + 
metrics: + - type: accuracy + value: 96.5909090909091 + - type: f1 + value: 95.49242424242425 + - type: precision + value: 94.9621212121212 + - type: recall + value: 96.5909090909091 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (arz-eng) + config: arz-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 84.90566037735849 + - type: f1 + value: 81.85883997204752 + - type: precision + value: 80.54507337526205 + - type: recall + value: 84.90566037735849 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (hrv-eng) + config: hrv-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 97.5 + - type: f1 + value: 96.75 + - type: precision + value: 96.38333333333333 + - type: recall + value: 97.5 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (nov-eng) + config: nov-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 86.7704280155642 + - type: f1 + value: 82.99610894941635 + - type: precision + value: 81.32295719844358 + - type: recall + value: 86.7704280155642 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (gsw-eng) + config: gsw-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 67.52136752136752 + - type: f1 + value: 61.89662189662191 + - type: precision + value: 59.68660968660969 + - type: recall + value: 67.52136752136752 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (nds-eng) + config: nds-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 89.2 + - type: f1 + value: 86.32 + - type: precision + value: 85.015 + - type: recall + value: 89.2 
+ - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (ukr-eng) + config: ukr-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 96.0 + - type: f1 + value: 94.78333333333333 + - type: precision + value: 94.18333333333334 + - type: recall + value: 96.0 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (uzb-eng) + config: uzb-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 83.8785046728972 + - type: f1 + value: 80.54517133956385 + - type: precision + value: 79.154984423676 + - type: recall + value: 83.8785046728972 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (lit-eng) + config: lit-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 93.60000000000001 + - type: f1 + value: 92.01333333333334 + - type: precision + value: 91.28333333333333 + - type: recall + value: 93.60000000000001 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (ina-eng) + config: ina-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 97.1 + - type: f1 + value: 96.26666666666667 + - type: precision + value: 95.85000000000001 + - type: recall + value: 97.1 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (lfn-eng) + config: lfn-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 84.3 + - type: f1 + value: 80.67833333333333 + - type: precision + value: 79.03928571428571 + - type: recall + value: 84.3 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (zsm-eng) + config: zsm-eng + split: test + revision: 
9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 97.3 + - type: f1 + value: 96.48333333333332 + - type: precision + value: 96.08333333333331 + - type: recall + value: 97.3 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (ita-eng) + config: ita-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 95.7 + - type: f1 + value: 94.66666666666667 + - type: precision + value: 94.16666666666667 + - type: recall + value: 95.7 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (cmn-eng) + config: cmn-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 97.2 + - type: f1 + value: 96.36666666666667 + - type: precision + value: 95.96666666666668 + - type: recall + value: 97.2 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (lvs-eng) + config: lvs-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 94.3 + - type: f1 + value: 92.80666666666667 + - type: precision + value: 92.12833333333333 + - type: recall + value: 94.3 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (glg-eng) + config: glg-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 97.0 + - type: f1 + value: 96.22333333333334 + - type: precision + value: 95.875 + - type: recall + value: 97.0 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (ceb-eng) + config: ceb-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 74.33333333333333 + - type: f1 + value: 70.78174603174602 + - type: precision + value: 69.28333333333332 + - type: recall + value: 74.33333333333333 + - 
task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (bre-eng) + config: bre-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 37.6 + - type: f1 + value: 32.938348952090365 + - type: precision + value: 31.2811038961039 + - type: recall + value: 37.6 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (ben-eng) + config: ben-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 91.5 + - type: f1 + value: 89.13333333333333 + - type: precision + value: 88.03333333333333 + - type: recall + value: 91.5 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (swg-eng) + config: swg-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 82.14285714285714 + - type: f1 + value: 77.67857142857143 + - type: precision + value: 75.59523809523809 + - type: recall + value: 82.14285714285714 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (arq-eng) + config: arq-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 69.0450054884742 + - type: f1 + value: 63.070409283362075 + - type: precision + value: 60.58992781824835 + - type: recall + value: 69.0450054884742 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (kab-eng) + config: kab-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 63.1 + - type: f1 + value: 57.848333333333336 + - type: precision + value: 55.69500000000001 + - type: recall + value: 63.1 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (fra-eng) + config: fra-eng + split: test + revision: 
9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 96.1 + - type: f1 + value: 95.01666666666667 + - type: precision + value: 94.5 + - type: recall + value: 96.1 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (por-eng) + config: por-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 95.89999999999999 + - type: f1 + value: 94.90666666666667 + - type: precision + value: 94.425 + - type: recall + value: 95.89999999999999 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (tat-eng) + config: tat-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 87.6 + - type: f1 + value: 84.61333333333333 + - type: precision + value: 83.27 + - type: recall + value: 87.6 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (oci-eng) + config: oci-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 76.4 + - type: f1 + value: 71.90746031746032 + - type: precision + value: 70.07027777777778 + - type: recall + value: 76.4 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (pol-eng) + config: pol-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 97.89999999999999 + - type: f1 + value: 97.26666666666667 + - type: precision + value: 96.95 + - type: recall + value: 97.89999999999999 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (war-eng) + config: war-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 78.8 + - type: f1 + value: 74.39555555555555 + - type: precision + value: 72.59416666666667 + - type: recall + value: 78.8 + - task: + type: 
BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (aze-eng) + config: aze-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 95.19999999999999 + - type: f1 + value: 93.78999999999999 + - type: precision + value: 93.125 + - type: recall + value: 95.19999999999999 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (vie-eng) + config: vie-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 97.8 + - type: f1 + value: 97.1 + - type: precision + value: 96.75 + - type: recall + value: 97.8 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (nno-eng) + config: nno-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 95.6 + - type: f1 + value: 94.25666666666666 + - type: precision + value: 93.64166666666668 + - type: recall + value: 95.6 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (cha-eng) + config: cha-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 56.934306569343065 + - type: f1 + value: 51.461591936044485 + - type: precision + value: 49.37434827945776 + - type: recall + value: 56.934306569343065 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (mhr-eng) + config: mhr-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 20.200000000000003 + - type: f1 + value: 16.91799284049284 + - type: precision + value: 15.791855158730158 + - type: recall + value: 20.200000000000003 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (dan-eng) + config: dan-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + 
metrics: + - type: accuracy + value: 96.2 + - type: f1 + value: 95.3 + - type: precision + value: 94.85 + - type: recall + value: 96.2 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (ell-eng) + config: ell-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 96.3 + - type: f1 + value: 95.11666666666667 + - type: precision + value: 94.53333333333333 + - type: recall + value: 96.3 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (amh-eng) + config: amh-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 89.88095238095238 + - type: f1 + value: 87.14285714285714 + - type: precision + value: 85.96230158730161 + - type: recall + value: 89.88095238095238 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (pam-eng) + config: pam-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 24.099999999999998 + - type: f1 + value: 19.630969083349783 + - type: precision + value: 18.275094905094907 + - type: recall + value: 24.099999999999998 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (hsb-eng) + config: hsb-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 83.4368530020704 + - type: f1 + value: 79.45183870649709 + - type: precision + value: 77.7432712215321 + - type: recall + value: 83.4368530020704 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (srp-eng) + config: srp-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 95.8 + - type: f1 + value: 94.53333333333333 + - type: precision + value: 93.91666666666666 + - type: recall + value: 95.8 + - task: 
+ type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (epo-eng) + config: epo-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 98.8 + - type: f1 + value: 98.48333333333332 + - type: precision + value: 98.33333333333334 + - type: recall + value: 98.8 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (kzj-eng) + config: kzj-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 17.5 + - type: f1 + value: 14.979285714285714 + - type: precision + value: 14.23235060690943 + - type: recall + value: 17.5 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (awa-eng) + config: awa-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 93.93939393939394 + - type: f1 + value: 91.991341991342 + - type: precision + value: 91.05339105339105 + - type: recall + value: 93.93939393939394 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (fao-eng) + config: fao-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 89.31297709923665 + - type: f1 + value: 86.76844783715012 + - type: precision + value: 85.63613231552164 + - type: recall + value: 89.31297709923665 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (mal-eng) + config: mal-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 99.12663755458514 + - type: f1 + value: 98.93255701115964 + - type: precision + value: 98.83551673944687 + - type: recall + value: 99.12663755458514 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (ile-eng) + config: ile-eng + split: test + revision: 
9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 92.0 + - type: f1 + value: 89.77999999999999 + - type: precision + value: 88.78333333333333 + - type: recall + value: 92.0 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (bos-eng) + config: bos-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 96.89265536723164 + - type: f1 + value: 95.85687382297553 + - type: precision + value: 95.33898305084746 + - type: recall + value: 96.89265536723164 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (cor-eng) + config: cor-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 14.6 + - type: f1 + value: 11.820611790170615 + - type: precision + value: 11.022616224355355 + - type: recall + value: 14.6 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (cat-eng) + config: cat-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 95.89999999999999 + - type: f1 + value: 94.93333333333334 + - type: precision + value: 94.48666666666666 + - type: recall + value: 95.89999999999999 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (eus-eng) + config: eus-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 87.6 + - type: f1 + value: 84.72333333333334 + - type: precision + value: 83.44166666666666 + - type: recall + value: 87.6 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (yue-eng) + config: yue-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 94.8 + - type: f1 + value: 93.47333333333333 + - type: precision + value: 92.875 + - type: 
recall + value: 94.8 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (swe-eng) + config: swe-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 96.6 + - type: f1 + value: 95.71666666666665 + - type: precision + value: 95.28333333333335 + - type: recall + value: 96.6 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (dtp-eng) + config: dtp-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 17.8 + - type: f1 + value: 14.511074040901628 + - type: precision + value: 13.503791000666002 + - type: recall + value: 17.8 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (kat-eng) + config: kat-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 94.10187667560321 + - type: f1 + value: 92.46648793565683 + - type: precision + value: 91.71134941912423 + - type: recall + value: 94.10187667560321 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (jpn-eng) + config: jpn-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 97.0 + - type: f1 + value: 96.11666666666666 + - type: precision + value: 95.68333333333334 + - type: recall + value: 97.0 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (csb-eng) + config: csb-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 72.72727272727273 + - type: f1 + value: 66.58949745906267 + - type: precision + value: 63.86693017127799 + - type: recall + value: 72.72727272727273 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (xho-eng) + config: xho-eng + split: test + 
revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 90.14084507042254 + - type: f1 + value: 88.26291079812206 + - type: precision + value: 87.32394366197182 + - type: recall + value: 90.14084507042254 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (orv-eng) + config: orv-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 64.67065868263472 + - type: f1 + value: 58.2876627696987 + - type: precision + value: 55.79255774165953 + - type: recall + value: 64.67065868263472 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (ind-eng) + config: ind-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 95.6 + - type: f1 + value: 94.41666666666667 + - type: precision + value: 93.85 + - type: recall + value: 95.6 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (tuk-eng) + config: tuk-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 55.172413793103445 + - type: f1 + value: 49.63992493549144 + - type: precision + value: 47.71405113769646 + - type: recall + value: 55.172413793103445 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (max-eng) + config: max-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 77.46478873239437 + - type: f1 + value: 73.4417616811983 + - type: precision + value: 71.91607981220658 + - type: recall + value: 77.46478873239437 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (swh-eng) + config: swh-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 84.61538461538461 + - type: f1 + value: 
80.91452991452994 + - type: precision + value: 79.33760683760683 + - type: recall + value: 84.61538461538461 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (hin-eng) + config: hin-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 98.2 + - type: f1 + value: 97.6 + - type: precision + value: 97.3 + - type: recall + value: 98.2 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (dsb-eng) + config: dsb-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 75.5741127348643 + - type: f1 + value: 72.00417536534445 + - type: precision + value: 70.53467872883321 + - type: recall + value: 75.5741127348643 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (ber-eng) + config: ber-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 62.2 + - type: f1 + value: 55.577460317460314 + - type: precision + value: 52.98583333333333 + - type: recall + value: 62.2 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (tam-eng) + config: tam-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 92.18241042345277 + - type: f1 + value: 90.6468124709167 + - type: precision + value: 89.95656894679696 + - type: recall + value: 92.18241042345277 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (slk-eng) + config: slk-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 96.1 + - type: f1 + value: 95.13333333333333 + - type: precision + value: 94.66666666666667 + - type: recall + value: 96.1 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB 
Tatoeba (tgl-eng) + config: tgl-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 96.8 + - type: f1 + value: 95.85000000000001 + - type: precision + value: 95.39999999999999 + - type: recall + value: 96.8 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (ast-eng) + config: ast-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 92.1259842519685 + - type: f1 + value: 89.76377952755905 + - type: precision + value: 88.71391076115485 + - type: recall + value: 92.1259842519685 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (mkd-eng) + config: mkd-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 94.1 + - type: f1 + value: 92.49 + - type: precision + value: 91.725 + - type: recall + value: 94.1 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (khm-eng) + config: khm-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 77.5623268698061 + - type: f1 + value: 73.27364463791058 + - type: precision + value: 71.51947852086357 + - type: recall + value: 77.5623268698061 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (ces-eng) + config: ces-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 97.39999999999999 + - type: f1 + value: 96.56666666666666 + - type: precision + value: 96.16666666666667 + - type: recall + value: 97.39999999999999 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (tzl-eng) + config: tzl-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 66.34615384615384 + - type: f1 
+ value: 61.092032967032964 + - type: precision + value: 59.27197802197802 + - type: recall + value: 66.34615384615384 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (urd-eng) + config: urd-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 94.89999999999999 + - type: f1 + value: 93.41190476190476 + - type: precision + value: 92.7 + - type: recall + value: 94.89999999999999 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (ara-eng) + config: ara-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 93.10000000000001 + - type: f1 + value: 91.10000000000001 + - type: precision + value: 90.13333333333333 + - type: recall + value: 93.10000000000001 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (kor-eng) + config: kor-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 93.7 + - type: f1 + value: 91.97333333333334 + - type: precision + value: 91.14166666666667 + - type: recall + value: 93.7 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (yid-eng) + config: yid-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 92.21698113207547 + - type: f1 + value: 90.3796046720575 + - type: precision + value: 89.56367924528303 + - type: recall + value: 92.21698113207547 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (fin-eng) + config: fin-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 97.6 + - type: f1 + value: 96.91666666666667 + - type: precision + value: 96.6 + - type: recall + value: 97.6 + - task: + type: BitextMining + dataset: + type: 
mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (tha-eng) + config: tha-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 97.44525547445255 + - type: f1 + value: 96.71532846715328 + - type: precision + value: 96.35036496350365 + - type: recall + value: 97.44525547445255 + - task: + type: BitextMining + dataset: + type: mteb/tatoeba-bitext-mining + name: MTEB Tatoeba (wuu-eng) + config: wuu-eng + split: test + revision: 9080400076fbadbb4c4dcb136ff4eddc40b42553 + metrics: + - type: accuracy + value: 94.1 + - type: f1 + value: 92.34000000000002 + - type: precision + value: 91.49166666666667 + - type: recall + value: 94.1 + - task: + type: Retrieval + dataset: + type: webis-touche2020 + name: MTEB Touche2020 + config: default + split: test + revision: None + metrics: + - type: map_at_1 + value: 3.2910000000000004 + - type: map_at_10 + value: 10.373000000000001 + - type: map_at_100 + value: 15.612 + - type: map_at_1000 + value: 17.06 + - type: map_at_3 + value: 6.119 + - type: map_at_5 + value: 7.917000000000001 + - type: mrr_at_1 + value: 44.897999999999996 + - type: mrr_at_10 + value: 56.054 + - type: mrr_at_100 + value: 56.82000000000001 + - type: mrr_at_1000 + value: 56.82000000000001 + - type: mrr_at_3 + value: 52.381 + - type: mrr_at_5 + value: 53.81 + - type: ndcg_at_1 + value: 42.857 + - type: ndcg_at_10 + value: 27.249000000000002 + - type: ndcg_at_100 + value: 36.529 + - type: ndcg_at_1000 + value: 48.136 + - type: ndcg_at_3 + value: 33.938 + - type: ndcg_at_5 + value: 29.951 + - type: precision_at_1 + value: 44.897999999999996 + - type: precision_at_10 + value: 22.653000000000002 + - type: precision_at_100 + value: 7.000000000000001 + - type: precision_at_1000 + value: 1.48 + - type: precision_at_3 + value: 32.653 + - type: precision_at_5 + value: 27.755000000000003 + - type: recall_at_1 + value: 3.2910000000000004 + - type: recall_at_10 + value: 16.16 + - type: recall_at_100 + value: 43.908 + - 
type: recall_at_1000 + value: 79.823 + - type: recall_at_3 + value: 7.156 + - type: recall_at_5 + value: 10.204 + - task: + type: Classification + dataset: + type: mteb/toxic_conversations_50k + name: MTEB ToxicConversationsClassification + config: default + split: test + revision: d7c0de2777da35d6aae2200a62c6e0e5af397c4c + metrics: + - type: accuracy + value: 71.05879999999999 + - type: ap + value: 14.609748142799111 + - type: f1 + value: 54.878956295843096 + - task: + type: Classification + dataset: + type: mteb/tweet_sentiment_extraction + name: MTEB TweetSentimentExtractionClassification + config: default + split: test + revision: d604517c81ca91fe16a244d1248fc021f9ecee7a + metrics: + - type: accuracy + value: 64.61799660441426 + - type: f1 + value: 64.8698191961434 + - task: + type: Clustering + dataset: + type: mteb/twentynewsgroups-clustering + name: MTEB TwentyNewsgroupsClustering + config: default + split: test + revision: 6125ec4e24fa026cec8a478383ee943acfbd5449 + metrics: + - type: v_measure + value: 51.32860036611885 + - task: + type: PairClassification + dataset: + type: mteb/twittersemeval2015-pairclassification + name: MTEB TwitterSemEval2015 + config: default + split: test + revision: 70970daeab8776df92f5ea462b6173c0b46fd2d1 + metrics: + - type: cos_sim_accuracy + value: 88.34714192048638 + - type: cos_sim_ap + value: 80.26732975975634 + - type: cos_sim_f1 + value: 73.53415148134374 + - type: cos_sim_precision + value: 69.34767360299276 + - type: cos_sim_recall + value: 78.25857519788919 + - type: dot_accuracy + value: 88.34714192048638 + - type: dot_ap + value: 80.26733698491206 + - type: dot_f1 + value: 73.53415148134374 + - type: dot_precision + value: 69.34767360299276 + - type: dot_recall + value: 78.25857519788919 + - type: euclidean_accuracy + value: 88.34714192048638 + - type: euclidean_ap + value: 80.26734337771738 + - type: euclidean_f1 + value: 73.53415148134374 + - type: euclidean_precision + value: 69.34767360299276 + - type: 
euclidean_recall + value: 78.25857519788919 + - type: manhattan_accuracy + value: 88.30541813196639 + - type: manhattan_ap + value: 80.19415808104145 + - type: manhattan_f1 + value: 73.55143870713441 + - type: manhattan_precision + value: 73.25307511122743 + - type: manhattan_recall + value: 73.85224274406332 + - type: max_accuracy + value: 88.34714192048638 + - type: max_ap + value: 80.26734337771738 + - type: max_f1 + value: 73.55143870713441 + - task: + type: PairClassification + dataset: + type: mteb/twitterurlcorpus-pairclassification + name: MTEB TwitterURLCorpus + config: default + split: test + revision: 8b6510b0b1fa4e4c4f879467980e9be563ec1cdf + metrics: + - type: cos_sim_accuracy + value: 89.81061047075717 + - type: cos_sim_ap + value: 87.11747055081017 + - type: cos_sim_f1 + value: 80.04355498817256 + - type: cos_sim_precision + value: 78.1165262000733 + - type: cos_sim_recall + value: 82.06806282722513 + - type: dot_accuracy + value: 89.81061047075717 + - type: dot_ap + value: 87.11746902745236 + - type: dot_f1 + value: 80.04355498817256 + - type: dot_precision + value: 78.1165262000733 + - type: dot_recall + value: 82.06806282722513 + - type: euclidean_accuracy + value: 89.81061047075717 + - type: euclidean_ap + value: 87.11746919324248 + - type: euclidean_f1 + value: 80.04355498817256 + - type: euclidean_precision + value: 78.1165262000733 + - type: euclidean_recall + value: 82.06806282722513 + - type: manhattan_accuracy + value: 89.79508673885202 + - type: manhattan_ap + value: 87.11074390832218 + - type: manhattan_f1 + value: 80.13002540726349 + - type: manhattan_precision + value: 77.83826945412311 + - type: manhattan_recall + value: 82.56082537727133 + - type: max_accuracy + value: 89.81061047075717 + - type: max_ap + value: 87.11747055081017 + - type: max_f1 + value: 80.13002540726349 +language: +- multilingual +- af +- am +- ar +- as +- az +- be +- bg +- bn +- br +- bs +- ca +- cs +- cy +- da +- de +- el +- en +- eo +- es +- et +- eu +- fa +- fi 
+- fr +- fy +- ga +- gd +- gl +- gu +- ha +- he +- hi +- hr +- hu +- hy +- id +- is +- it +- ja +- jv +- ka +- kk +- km +- kn +- ko +- ku +- ky +- la +- lo +- lt +- lv +- mg +- mk +- ml +- mn +- mr +- ms +- my +- ne +- nl +- 'no' +- om +- or +- pa +- pl +- ps +- pt +- ro +- ru +- sa +- sd +- si +- sk +- sl +- so +- sq +- sr +- su +- sv +- sw +- ta +- te +- th +- tl +- tr +- ug +- uk +- ur +- uz +- vi +- xh +- yi +- zh +license: mit +--- + +## Multilingual-E5-large-instruct + +[Multilingual E5 Text Embeddings: A Technical Report](https://arxiv.org/pdf/2402.05672). +Liang Wang, Nan Yang, Xiaolong Huang, Linjun Yang, Rangan Majumder, Furu Wei, arXiv 2024 + +This model has 24 layers and the embedding size is 1024. + +## Usage + +Below are examples to encode queries and passages from the MS-MARCO passage ranking dataset. + +### Transformers + +```python +import torch.nn.functional as F + +from torch import Tensor +from transformers import AutoTokenizer, AutoModel + + +def average_pool(last_hidden_states: Tensor, + attention_mask: Tensor) -> Tensor: + last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0) + return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None] + +def get_detailed_instruct(task_description: str, query: str) -> str: + return f'Instruct: {task_description}\nQuery: {query}' + +# Each query must come with a one-sentence instruction that describes the task +task = 'Given a web search query, retrieve relevant passages that answer the query' +queries = [ + get_detailed_instruct(task, 'how much protein should a female eat'), + get_detailed_instruct(task, '南瓜的家常做法') +] +# No need to add instruction for retrieval documents +documents = [ + "As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. 
Check out the chart below to see how much protein you should be eating each day.", + "1.清炒南瓜丝 原料:嫩南瓜半个 调料:葱、盐、白糖、鸡精 做法: 1、南瓜用刀薄薄的削去表面一层皮,用勺子刮去瓤 2、擦成细丝(没有擦菜板就用刀慢慢切成细丝) 3、锅烧热放油,入葱花煸出香味 4、入南瓜丝快速翻炒一分钟左右,放盐、一点白糖和鸡精调味出锅 2.香葱炒南瓜 原料:南瓜1只 调料:香葱、蒜末、橄榄油、盐 做法: 1、将南瓜去皮,切成片 2、油锅8成热后,将蒜末放入爆香 3、爆香后,将南瓜片放入,翻炒 4、在翻炒的同时,可以不时地往锅里加水,但不要太多 5、放入盐,炒匀 6、南瓜差不多软和绵了之后,就可以关火 7、撒入香葱,即可出锅" +] +input_texts = queries + documents + +tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-large-instruct') +model = AutoModel.from_pretrained('intfloat/multilingual-e5-large-instruct') + +# Tokenize the input texts +batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt') + +outputs = model(**batch_dict) +embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask']) + +# normalize embeddings +embeddings = F.normalize(embeddings, p=2, dim=1) +scores = (embeddings[:2] @ embeddings[2:].T) * 100 +print(scores.tolist()) +# => [[91.92852783203125, 67.580322265625], [70.3814468383789, 92.1330795288086]] +``` + +### Sentence Transformers + +```python +from sentence_transformers import SentenceTransformer + +def get_detailed_instruct(task_description: str, query: str) -> str: + return f'Instruct: {task_description}\nQuery: {query}' + +# Each query must come with a one-sentence instruction that describes the task +task = 'Given a web search query, retrieve relevant passages that answer the query' +queries = [ + get_detailed_instruct(task, 'how much protein should a female eat'), + get_detailed_instruct(task, '南瓜的家常做法') +] +# No need to add instruction for retrieval documents +documents = [ + "As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. 
Check out the chart below to see how much protein you should be eating each day.", + "1.清炒南瓜丝 原料:嫩南瓜半个 调料:葱、盐、白糖、鸡精 做法: 1、南瓜用刀薄薄的削去表面一层皮,用勺子刮去瓤 2、擦成细丝(没有擦菜板就用刀慢慢切成细丝) 3、锅烧热放油,入葱花煸出香味 4、入南瓜丝快速翻炒一分钟左右,放盐、一点白糖和鸡精调味出锅 2.香葱炒南瓜 原料:南瓜1只 调料:香葱、蒜末、橄榄油、盐 做法: 1、将南瓜去皮,切成片 2、油锅8成热后,将蒜末放入爆香 3、爆香后,将南瓜片放入,翻炒 4、在翻炒的同时,可以不时地往锅里加水,但不要太多 5、放入盐,炒匀 6、南瓜差不多软和绵了之后,就可以关火 7、撒入香葱,即可出锅" +] +input_texts = queries + documents + +model = SentenceTransformer('intfloat/multilingual-e5-large-instruct') + +embeddings = model.encode(input_texts, convert_to_tensor=True, normalize_embeddings=True) +scores = (embeddings[:2] @ embeddings[2:].T) * 100 +print(scores.tolist()) +# [[91.92853546142578, 67.5802993774414], [70.38143157958984, 92.13307189941406]] +``` + +### Infinity + +Usage with [Infinity](https://github.com/michaelfeil/infinity): + +```bash +docker run --gpus all -v $PWD/data:/app/.cache -e HF_TOKEN=$HF_TOKEN -p "7997":"7997" \ +michaelf34/infinity:0.0.68 \ +v2 --model-id intfloat/multilingual-e5-large-instruct --revision "main" --dtype float16 --batch-size 32 --engine torch --port 7997 +``` + +## Supported Languages + +This model is initialized from [xlm-roberta-large](https://huggingface.co/xlm-roberta-large) +and continually trained on a mixture of multilingual datasets. +It supports 100 languages from xlm-roberta, +but low-resource languages may see performance degradation. + +## Training Details + +**Initialization**: [xlm-roberta-large](https://huggingface.co/xlm-roberta-large) + +**First stage**: contrastive pre-training with 1 billion weakly supervised text pairs. + +**Second stage**: fine-tuning on datasets from the [E5-mistral](https://arxiv.org/abs/2401.00368) paper. + +## MTEB Benchmark Evaluation + +Check out [unilm/e5](https://github.com/microsoft/unilm/tree/master/e5) to reproduce evaluation results +on the [BEIR](https://arxiv.org/abs/2104.08663) and [MTEB benchmark](https://arxiv.org/abs/2210.07316). + +## FAQ + +**1. 
Do I need to add instructions to the query?** + +Yes, this is how the model is trained, otherwise you will see a performance degradation. +The task definition should be a one-sentence instruction that describes the task. +This is a way to customize text embeddings for different scenarios through natural language instructions. + +Please check out [unilm/e5/utils.py](https://github.com/microsoft/unilm/blob/9c0f1ff7ca53431fe47d2637dfe253643d94185b/e5/utils.py#L106) for instructions we used for evaluation. + +On the other hand, there is no need to add instructions to the document side. + +**2. Why are my reproduced results slightly different from reported in the model card?** + +Different versions of `transformers` and `pytorch` could cause negligible but non-zero performance differences. + +**3. Why does the cosine similarity scores distribute around 0.7 to 1.0?** + +This is a known and expected behavior as we use a low temperature 0.01 for InfoNCE contrastive loss. + +For text embedding tasks like text retrieval or semantic similarity, +what matters is the relative order of the scores instead of the absolute values, +so this should not be an issue. + +## Citation + +If you find our paper or models helpful, please consider cite as follows: + +``` +@article{wang2024multilingual, + title={Multilingual E5 Text Embeddings: A Technical Report}, + author={Wang, Liang and Yang, Nan and Huang, Xiaolong and Yang, Linjun and Majumder, Rangan and Wei, Furu}, + journal={arXiv preprint arXiv:2402.05672}, + year={2024} +} +``` + +## Limitations + +Long texts will be truncated to at most 512 tokens. 
diff --git a/victord/sub19/models/multilingual-e5-large-instruct/config.json b/victord/sub19/models/multilingual-e5-large-instruct/config.json new file mode 100644 index 0000000000000000000000000000000000000000..97a8add06d50588ae7f902808c2101e4fe5adedd --- /dev/null +++ b/victord/sub19/models/multilingual-e5-large-instruct/config.json @@ -0,0 +1,27 @@ +{ + "architectures": [ + "XLMRobertaModel" + ], + "attention_probs_dropout_prob": 0.1, + "bos_token_id": 0, + "classifier_dropout": null, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 4096, + "layer_norm_eps": 1e-05, + "max_position_embeddings": 514, + "model_type": "xlm-roberta", + "num_attention_heads": 16, + "num_hidden_layers": 24, + "output_past": true, + "pad_token_id": 1, + "position_embedding_type": "absolute", + "torch_dtype": "bfloat16", + "transformers_version": "4.53.2", + "type_vocab_size": 1, + "use_cache": true, + "vocab_size": 250002 +} diff --git a/victord/sub19/models/multilingual-e5-large-instruct/config_sentence_transformers.json b/victord/sub19/models/multilingual-e5-large-instruct/config_sentence_transformers.json new file mode 100644 index 0000000000000000000000000000000000000000..aaa499896f85c8e2ba321476fd1479e476145577 --- /dev/null +++ b/victord/sub19/models/multilingual-e5-large-instruct/config_sentence_transformers.json @@ -0,0 +1,14 @@ +{ + "__version__": { + "sentence_transformers": "5.1.1", + "transformers": "4.53.2", + "pytorch": "2.9.0+cu128" + }, + "model_type": "SentenceTransformer", + "prompts": { + "query": "", + "document": "" + }, + "default_prompt_name": null, + "similarity_fn_name": "cosine" +} \ No newline at end of file diff --git a/victord/sub19/models/multilingual-e5-large-instruct/model.safetensors b/victord/sub19/models/multilingual-e5-large-instruct/model.safetensors new file mode 100644 index 
0000000000000000000000000000000000000000..0384b7096532bb7b0e04bae0c014649f6b09642f --- /dev/null +++ b/victord/sub19/models/multilingual-e5-large-instruct/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23e75f34a6a87a71c6a6a4a9244c0520599b3593a9cb13275fa4a92a970fb85a +size 1119826072 diff --git a/victord/sub19/models/multilingual-e5-large-instruct/modules.json b/victord/sub19/models/multilingual-e5-large-instruct/modules.json new file mode 100644 index 0000000000000000000000000000000000000000..952a9b81c0bfd99800fabf352f69c7ccd46c5e43 --- /dev/null +++ b/victord/sub19/models/multilingual-e5-large-instruct/modules.json @@ -0,0 +1,20 @@ +[ + { + "idx": 0, + "name": "0", + "path": "", + "type": "sentence_transformers.models.Transformer" + }, + { + "idx": 1, + "name": "1", + "path": "1_Pooling", + "type": "sentence_transformers.models.Pooling" + }, + { + "idx": 2, + "name": "2", + "path": "2_Normalize", + "type": "sentence_transformers.models.Normalize" + } +] \ No newline at end of file diff --git a/victord/sub19/models/multilingual-e5-large-instruct/sentence_bert_config.json b/victord/sub19/models/multilingual-e5-large-instruct/sentence_bert_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4eca68d85ecd3034cf4174d8a4033a75344ea62d --- /dev/null +++ b/victord/sub19/models/multilingual-e5-large-instruct/sentence_bert_config.json @@ -0,0 +1,4 @@ +{ + "max_seq_length": 512, + "do_lower_case": false +} \ No newline at end of file diff --git a/victord/sub19/models/multilingual-e5-large-instruct/sentencepiece.bpe.model b/victord/sub19/models/multilingual-e5-large-instruct/sentencepiece.bpe.model new file mode 100644 index 0000000000000000000000000000000000000000..7a3f40a75f870bc1f21700cd414dc2acc431583c --- /dev/null +++ b/victord/sub19/models/multilingual-e5-large-instruct/sentencepiece.bpe.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865 +size 5069051 diff --git a/victord/sub19/models/multilingual-e5-large-instruct/special_tokens_map.json b/victord/sub19/models/multilingual-e5-large-instruct/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b1879d702821e753ffe4245048eee415d54a9385 --- /dev/null +++ b/victord/sub19/models/multilingual-e5-large-instruct/special_tokens_map.json @@ -0,0 +1,51 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "cls_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "mask_token": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "sep_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/victord/sub19/models/multilingual-e5-large-instruct/tokenizer.json b/victord/sub19/models/multilingual-e5-large-instruct/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..7126b1d228f6e5f725b6ee02a88b1118c725aeed --- /dev/null +++ b/victord/sub19/models/multilingual-e5-large-instruct/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a56def25aa40facc030ea8b0b87f3688e4b3c39eb8b45d5702b3a1300fe2a20 +size 17082734 diff --git a/victord/sub19/models/multilingual-e5-large-instruct/tokenizer_config.json 
b/victord/sub19/models/multilingual-e5-large-instruct/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0df9458a41642f96287ae84a989281a3bdcebacf --- /dev/null +++ b/victord/sub19/models/multilingual-e5-large-instruct/tokenizer_config.json @@ -0,0 +1,56 @@ +{ + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250001": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [], + "bos_token": "", + "clean_up_tokenization_spaces": true, + "cls_token": "", + "eos_token": "", + "extra_special_tokens": {}, + "mask_token": "", + "model_max_length": 512, + "pad_token": "", + "sep_token": "", + "tokenizer_class": "XLMRobertaTokenizer", + "unk_token": "" +} diff --git a/victord/sub19/models/multilingual-e5-large_pseudo_full/1_Pooling/config.json b/victord/sub19/models/multilingual-e5-large_pseudo_full/1_Pooling/config.json new file mode 100644 index 0000000000000000000000000000000000000000..3f060ee536308b48017dad1a834f306f115695a3 --- /dev/null +++ b/victord/sub19/models/multilingual-e5-large_pseudo_full/1_Pooling/config.json @@ -0,0 +1,10 @@ +{ + "word_embedding_dimension": 1024, + "pooling_mode_cls_token": false, + "pooling_mode_mean_tokens": true, + "pooling_mode_max_tokens": false, + "pooling_mode_mean_sqrt_len_tokens": false, + "pooling_mode_weightedmean_tokens": false, + "pooling_mode_lasttoken": false, + 
"include_prompt": true +} \ No newline at end of file diff --git a/victord/sub19/models/multilingual-e5-large_pseudo_full/README.md b/victord/sub19/models/multilingual-e5-large_pseudo_full/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7f748db8bd1ed8ec8948c3811fe084e038b15655 --- /dev/null +++ b/victord/sub19/models/multilingual-e5-large_pseudo_full/README.md @@ -0,0 +1,773 @@ +--- +tags: +- sentence-transformers +- sentence-similarity +- feature-extraction +- dense +- generated_from_trainer +- dataset_size:417864 +- loss:CoSENTLoss +base_model: intfloat/multilingual-e5-large +widget: +- source_sentence: 'query: מה התהליך לזיהוי חשודים בפגיעות בחברי כנסת?' + sentences: + - 'passage: צוואת אלפרד נובל + + חמשת הפרסים המקוריים המוענקים מקרן נובל ניתנים על פי צוואתו של אלפרד נובל, כימאי + ותעשיין שהמציא את הדינמיט והתעשר מכך. אף שנובל כתב מספר צוואות במהלך חייו, על + האחרונה שבהן חתם כשנה בטרם מותו במועדון השוודי בפריז, ביום 27 בנובמבר 1895. פיתוחיו + של נובל נגעו ישירות בפיתוח וייצור חומרי נפץ והוא חש אי נוחות גוברת לנוכח השימוש + הצבאי בהמצאותיו. מסופר כי אחד הגורמים לצוואתו האחרונה היה ידיעה שפורסמה על מותו + בעיתון צרפתי, זאת בטרם עת ובטעות, כאשר הלך לעולמו אחיו לודוויג. אותה ידיעה שגויה + על מות נובל הופיעה תחת הכותרת "סוחר המוות מת" (צרפתית: Le marchand de la mort + est mort‏). + + נובל ציווה לתת 94% מכל רכושו, סך הכל 31 מיליון קרונות שוודיות, לייסוד קרן להענקת + חמישה פרסים שנתיים, וכך כתב בצוואתו:' + - "passage: פה יש דוגמה אחת מתוך בלוג של בלוגרית סעודית, אבל אני חושב ששווה להסתכל\ + \ דווקא על הדוגמה השנייה: זה יריד תיירות שמתקיים באמירויות בזמן המבצע. הוא לא\ + \ התבטל, הוא התקיים. זה חלק מהמאמץ לשמר נורמליזציה ולדאוג שהמדינות האלה יישארו\ + \ on track.\nכמה דברים על המספרים בתוך המבצע הזה. קודם כל, גם בתקשורת העולמית\ + \ המדינתית וגם בבין-לאומית הייתה שליטה מוחלטת בדוברים של מדינת ישראל. אני יכול\ + \ להגיד שזה לא בהכרח אומר שזה לחיוב כי הוויכוח של הדוברים שלנו לא היה עימות עם\ + \ דוברי חמאס. 
אני יכול להגיד שאין לגיטימציה לחמאס בעולם, אבל המיקוד היה בסבל כמו\ + \ במערכות – אני בוגר מחומת מגן – ועד בכלל של מערכות כאלה. המיקוד נוטה מהר מאוד\ + \ לעסוק בסבל הפלסטיני, בפגיעות בפלסטינים. אני יכול להגיד שבהשוואה למבצעים אחרים,\ + \ היינו במקום יותר טוב בעניין הזה. היה הרבה פחות מהנושא הזה וזה ייאמר לחיוב. הנציגויות\ + \ שלנו בחו\"ל קיימו 734 ראיונות - - -\n\nאופיר סופר (האיחוד הלאומי): \n\nהיה\ + \ פחות בנושא הזה – מה כן היה? מה מילא את הוואקום?" + - "passage: גלי קרן: \n\nנעים מאוד. הצעת החוק שמונחת בפני הוועדה כעת, הצעת חוק\ + \ לייעול האכיפה והפיקוח העירוניים ברשויות המקומיות (הוראת שעה) (תיקון מס' 6) (הפיכה\ + \ להוראת קבע), התשפ\"א-2021, תכף אדבר על הסיכום. הצעת החוק נוגעת לתוכנית השיטור\ + \ העירוני – אני מניחה שכולם מכירים את זה יותר בשם הזה. בעצם מדובר בהצעת חוק שחוקקה\ + \ בשנת 2011 בהוראת שעה שהינה צעד משלים להחלטת ממשלה 1848 משנת 2010, שהקימה את\ + \ מערכי האכיפה העירוניים. מערכי האכיפה העירוניים הם מערכים משותפים של משטרת ישראל\ + \ ופקחי הרשויות המקומיות שפועלים במשותף – הן למניעת עבירות איכות חיים, והן למניעת\ + \ אלימות. בשנות פעילות המערכים האלה הגדלנו את מספר הרשויות המקומיות – כמובן באישור\ + \ ועדת הפנים והגנת הסביבה של הכנסת – שבהם פועלים המערכים. כיום הם פועלים ב-71\ + \ רשויות מקומיות בסך הכול. \n\nהחוק הסדיר שני נושאים מרכזיים. הראשון, פרק ג' –\ + \ החוק שעוסק בסמכויות פקחים עירוניים באכיפת חוקי עזר עירוניים. וחלק ד' לחוק שעוסק\ + \ במהלך האכיפה העירונית, מסדיר מספר נושאים שקשורים לפעילות מערך האכיפה העירונית,\ + \ ובין השאר מאפשר לשר לביטחון הפנים להסמיך פקחים עירוניים בסמכויות נוספות לצורך\ + \ סיוע למשטרת ישראל בפעולות למניעת אלימות. כפי שאמרתי, המערכים והחוק פעילים משנת\ + \ 2011. רצינו מספר פעמים להפוך את הוראת השעה להוראה קבועה – זאת גם ההצעה המקורית\ + \ שהונחה על שולחן הוועדה, אבל רק לקריאה ראשונה. הבנתי שהסיכום שאליו הגיעו הוא\ + \ שבשלב זה נפעל להארכת הוראת השעה לתקופה נוספת." +- source_sentence: 'query: כמה שעות מותר לעבוד בסך הכל יחד עם שעות נוספות?' 
+ sentences: + - "passage: מדריך למעסיק לניהול עובדים\n שעות נוספות ועבודה במשמרת לילה\n\n\nהשאלה\n\ + \nעיקר התשובה\n\nראו הרחבה\n\nדגשים למעסיק\n\n\nהאם יש מגבלה על מספר השעות שבהם\ + \ אני יכול להעסיק עובדים?\n\n\nהעסקת עובד מעבר לשעות של יום עבודה מלא או שבוע\ + \ עבודה מלא (42 שעות) מוגדרת כהעסקה בשעות נוספות.\nאסור להעסיק עובדים יותר מ-12\ + \ שעות ביום, כולל שעות נוספות\nבכל מקרה, אסור שסך השעות הנוספות של העובד במהלך\ + \ כל השבוע יהיה מעל ל-16 שעות\nאסור להעסיק עובד המועסק בעבודת לילה יותר מ-58 שעות\ + \ בשבוע, כולל שעות נוספות\nאסור להעסיק עובדים בשעות נוספות באופן קבוע ללא היתר\ + \ משר הכלכלה (היתרים כאלה ניתנו, למשל, למעסיקים של עובדי שמירה ואבטחה ועובדים\ + \ בבתי מלון, בבתי הארחה, בבתי קפה ובמסעדות).\nאסור להעסיק עובדת החל מהחודש החמישי\ + \ להריונה בשעות נוספות.\n\n\nמגבלות על העסקה בשעות נוספות\nגמול עבור שעות נוספות\n\ + \n\nבכל מקרה יש לשלם לעובדים גמול מיוחד עבור עבודתם בשעות הנוספות.\n\n\nכמה עלי\ + \ לשלם לעובד עבור שעות נוספות?\n\n\nשבוע עבודה מחושב לפי בסיס של 42 שעות שבועיות\ + \ ויום עבודה מלא מחושב על בסיס של 8 שעות ו-36 דקות במשך 4 ימים בשבוע, וביום המקוצר\ + \ על 7 שעות ו-36 דקות ביום (8 שעות ביום במקומות עבודה בהם מונהג בשבוע עבודה בן\ + \ 6 ימים).\nשעות העבודה מעבר לתקן היומי או השבועי נחשבות כשעות נוספות.\nביום חול\ + \ עליך לשלם לעובד תוספת של 25% עבור השעתיים הנוספות הראשונות ותוספת של 50% החל\ + \ מהשעה השלישית ואילך.\n\n\nיום עבודה ושבוע עבודה\nגמול עבור שעות נוספות\nגמול\ + \ גלובלי עבור עבודה בשעות נוספות\nעבודה במשמרת לילה\n\n\nאסור למעסיק לעשות קיזוז\ + \ בין ימים בהם עובד החסיר שעות עבודה, לבין ימים בהם עבד שעות נוספות - למעט במקרים\ + \ של העסקה במתכונת של \"שעות גמישות\".\nלמרות זאת, במקרה של העסקה גמישה (כאשר\ + \ על העובד להשלים מכסות שעות חודשית מסוימת, ולעובד נתונה הזכות המלאה לבחור את\ + \ המועדים שבהם הוא מבצע את העבודה), רשאי מעסיק לאפשר לעובד לעבוד יום עבודה שאינו\ + \ מלא ביום מסוים מבלי ששכרו יופחת ולהשלים את השעות החסרות ביום אחר באותו חודש\ + \ מבלי לקבל גמול שעות נוספות עבור אותו יום.\nאם המעסיק משלם לעובד 
גמול שעות נוספות\ + \ גלובליות, יש להבטיח כי התמורה בעד שעות העבודה הרגילות והנוספות גם יחד לא תפחת\ + \ מזו שהייתה משתלמת לעובד אילו נערך בעדו רישום פרטני של שעות עבודתו ופיקוח מדויק\ + \ עליהן.\nעובד זר המועסק בסיעוד אינו זכאי לגמול עבור שעות נוספות.\n\n\nמהי משמרת\ + \ לילה?\n\n\nעבודה במשמרת לילה היא עבודה שלפחות שעתיים ממנה הן בין השעות 22:00\ + \ ל-06:00.\n7 שעות עבודה במשמרת לילה נחשבות ליום עבודה מלא וכל שעה מעבר להן תיחשב\ + \ לשעה נוספת המזכה בגמול עבור שעות נוספות.\nעובדי אולמות וגני אירועים המועסקים\ + \ במשמרת לילה זכאים לגמול עבור שעות נוספות מעבר ל-6 השעות הראשונות.\n\n\nעבודה\ + \ במשמרת לילה\nגמול עבור שעות נוספות במשמרת לילה לעובדי אולמות וגני אירועים\n\n\ + \n\nהאם מותר לי להעסיק עובדים בשעות הלילה ללא הגבלה?\n\n\nקיימות הגבלות על העסקה\ + \ בשעות לילה.\nבמקום עבודה שעובדים בו במשמרות, אסור להעסיק עובד במשמרות לילה יותר\ + \ משבוע אחד מתוך שבועיים.\nאסור להעסיק עובד המועסק במשמרת לילה יותר מ-58 שעות\ + \ בשבוע, כולל שעות נוספות\nאסור להעסיק עובדת בהיריון החל מהחודש החמישי להיריון,\ + \ שהודיעה למעסיק בכתב כי היא אינה מסכימה לעבוד בלילה.\nעובדת שחזרה מחופשת לידה\ + \ לא תועסק בעבודת לילה במשך 4 חודשים מתום החופשה.\nאין להעסיק בני נוער בעבודת\ + \ לילה, אלא באישור מיוחד משר הכלכלה. אסור להעסיק נער מתחת לגיל 16 בין השעות 20:00\ + \ בערב ל-08:00 בבוקר למחרת.\n\n\nעבודה במשמרת לילה\nאיסור עבודת לילה מעל שבוע\n\ + הגבלת שעות עבודה לאחר חופשת לידה\nאיסור העסקת נוער בעבודת לילה\nהגבלת עבודה בהיריון\n\ + \n\nבתקופה של חופשת לימודים רשמית ניתן להעסיק בני נוער בגילאי 16- 18 עד השעה 24:00\ + \ גם ללא היתר משר הכלכלה. אם ההעסקה הסתיימה לאחר השעה 23:00, על המעסיק להחזיר\ + \ את הנער לביתו, שלא בתחבורה ציבורית, מיד לאחר סיום העבודה." + - 'passage: מסטיק או גומי לעיסה הוא סוג של ממתק הנועד ללעיסה. + + בני המאיה נהגו ללעוס את שרף הגומי המופק משרף עץ הספודיליה (שמו, באזור המצוי היום + בתחומי מקסיקו, היה "צ''יקלה"). 
המצאת המסטיק ה"מודרני" מיוחסת לתומס אדמס, שבשנת + 1871 ניסה לטעום את חומר הגלם שלו, בשעה שניסה לייצר צעצועים משרף עץ הספודיליה (Sapodilla), + עץ הגדל ביערות הגשם הטרופיים של מרכז אמריקה. + + במקביל להמצאת המסטיק ביבשת אמריקה, תועד שימוש במסטיק העשוי משרף נחשים, על ידי + שבטים נודדים במצרים העתיקה בתקופה הפרה-הלניסטית. בשל אופן ייצורו, משרף נחשים ארסיים, + שימש המסטיק בטקסי התבגרות ופיריון כקמע וכאמצעי קדוש עליו היו שומרים. בעת חתונה + הכלה והחתן לעסו מסטיק במשותף כסמל לאהבתם הנצחית.' + - 'passage: גובה הריבית. + + סוג ההצמדה למדד: לא צמוד; צמוד לריבית הפריים; צמוד לדולר; צמוד למדד המחירים לצרכן; + צמוד למדד תשומות הבנייה; צמוד לערך עוגן כלשהו – או צירוף של הנ"ל. + + תכיפות השתנות הריבית (קבועה, חודשית, אחת לשנה, אחת לחמש שנים וכו''). + + משך ההלוואה (לרוב בין 5 ל־30 שנים). + + לוח הסילוקין (לוח שפיצר, בוליט, קרן שווה). + + מספר ההזדמנויות ומועד ההזדמנויות שבהן ניתן לפרוע את ההלוואה ("נקודות יציאה"). + + קיימים תשעה מסלולי משכנתה לדיור עיקריים:' +- source_sentence: 'query: איזו עצם מגנה על הלב?' + sentences: + - 'passage: סוגי רקמת חיבור + + רקמת חיבור יסודית + + קיימים שני סוגים של רקמת חיבור יסודית (connective tissue proper), כאשר כל אחד + מהם בנוי מאותם מרכיבים, אך בהרכב שונה. + + רקמת חיבור רפה (נקראת גם רקמת חיבור רופפת או תחוחה) היא רקמת החיבור הנפוצה ביותר. + היא נמצאת בשכבת הרירית המיוחדת של מערכת העיכול ודרכי הנשימה, בקרומים הנסיוביים + של חללי הצפק, האדר ומסב הלב, בחללים בין סיבי שריר, סביב כלי דם ולימפה ובעור (דרמיס). + לרקמות אלה היא מעניקה תמיכה והזנה. היא מכילה פיברובלסטים, אשר מכתיבים את אופייה + בייצור חלבוני הסיבים, ותאי דם לבנים (מקרופאגים, תאי פיטום, לימפוציטים ועוד), המספקים + הגנה ראשונית מפני זיהומים על ידי זיהוי הפולשים ונטרולם. התאים והסיבים מסודרים + ברקמה בצורה רופפת ומפוזרת. כמו כן, היא עשירה במים, דבר המקנה לה גמישות. דוגמה + לרקמה כזו היא מתלה המעי. + + רקמת חיבור צפופה מכילה מעט תאים, אך סיבי קולגן רבים. היא מחולקת לשתי קבוצות: רקמת + חיבור צפופה סדירה (רגולרית) ולא סדירה (אי-רגולרית). 
רקמת החיבור הסדירה מכילה צברים + צפופים של סיבים, במיוחד קולגן, המסודרים בכיוון מקבילי וברור, ואין בה תאים רבים. + תפקידה לתת חוזק וגמישות. היא נמצאת בגידים, ברצועות, באלל (אפונוירוזה), בקרנית, + בקרום הקשה ובגלימה הלבנה באשכים. רקמת החיבור הלא סדירה מכילה צברים של קולגן בכיוונים + אקראיים. היא נמצאת בבסיס צינורות מערכת העיכול, בעור (דרמיס) וסביב שרירי השלד, + בקופסיות המפרקים ובאיברים נמתחים אחרים. תפקידה למנוע מתיחה יתר על המידה ולספק + דם ועצבוב לאיברים אלה.' + - "passage: הקלות בזכאות לדמי אבטלה שנקבעו החל מיולי 2021 בתקופת משבר הקורונה\n\ + \ \nקבלת דמי אבטלה במקביל לקצבאות מסוימות\nסכום דמי האבטלה יילקח בחשבון כהכנסה\ + \ מעבודה בקביעת הזכאות לקצבאות הבאות (אך לא יופחת מסכום הקצבה למי שזכאים לה):\n\ + \nמי שעונים על תנאי הזכאות של דמי אבטלה ושל קצבה אחרת יכולים לבדוק את האפשרות\ + \ לקבל את שני התשלומים במקביל במחשבון המוסד לביטוח לאומי.\nקצבת מזונות\nהבטחת\ + \ הכנסה\nקצבת אזרח ותיק (לנשים עד גיל 67)\nקצבת שאירים\nקצבת נכות כללית (ותוספת\ + \ תלויים לקצבה)\n\nגורמים מסייעים\nגורמי ממשל\nהמוסד לביטוח לאומי" + - 'passage: מוזיאון הרכבת התחתית + + מוזיאון הרכבת התחתית נפתח בשנת 1975 ליד תחנת Deák Ferenc tér, שהיא התחנה היחידה + בה עוצרים שלושת קווי הרכבת. המוזיאון נבנה בקטע נטוש של מנהרת הרכבת התחתית אשר + שימשה עד שנת 1955 כדפו של הקו היחיד דאז, קו M1. תצוגות המוזיאון מתארות את העבר, + ההווה והעתיד של הרכבת התחתית של בודפשט ומתמקדות בעיקר בקו המילניום. במוזיאון מוצג + קטע משומר של תחנת Gizella tér (כיום Vörösmarty tér) כפי שנראתה לפני תחילת השיפוצים + של 1973. בחלק אחר של המוזיאון מוצגת ההיסטוריה של קו המילניום באנגלית ובהונגרית. + כאן נמצאות גם רכבות מקוריות ששירתו בקו המילניום ואפילו קטע קצה מסילה הכולל ציוד + מקורי שהובא במיוחד מהדפו הישן של קו M1 בתחנת Artézi fürdő (כיום Széchenyi fürdő). + ציוד זה הובא מהתחנה כאשר הפסיקה לשמש כתחנת קצה, וזאת לאחר בניית תחנה נוספת בקו + - תחנת Mexikói út. בתצוגת קבע נוספת במוזיאון מוצגים דגמים מפורטים של קרונות רכבת + שונים מכל קווי הרכבת התחתית. 
בנוסף מוצגים דגמים נוספים הקשורים ברכבת, כמו למשל + קטעי מנהרה וכן הסברים, מסמכים, תצלומים וחפצים מקוריים שיצאו משימוש.' +- source_sentence: 'query: מהן הוועדות המיוחדות שיוקמו בכנסת?' + sentences: + - 'passage: רקע ומהות + + ז''אנר זה חופשי מאוד ומאפשר נימה אישית, צינית או הומוריסטית, בהתאם לרצונו של הכותב. + מאמרים פובליציסטיים הם לרוב ארוכים יחסית ומנומקים. בדרך כלל הם עוסקים בפוליטיקה + או בנושא אחר שעל סדר היום הציבורי. כיוון שהפובליציסטיקה עוסקת בענייני דיומא, היא + מתפרסמת בעיקר בעיתונים היומיים ובשבועונים. + + מאמרים פובליציסטיים נכתבים בעיקר על ידי עיתונאים שמקצועם בכך, חברי מערכת העיתון, + אך גם על ידי אישים שונים, הכותבים בעיקר בנושאים הקשורים לתחומי התמחותם. בין הפובליציסטים + יש העוסקים רק בכך, ויש העושים זאת בנוסף לכתיבת דיווחי חדשות. בין הפובליציסטים, + רבים הם בעלי טור קבוע, ויש המפרסמים את מאמריהם באופן פחות סדיר, בעמודי הדעות של + העיתון.' + - "passage: נכון. אם ירצו להביא את זה, זה יהיה על סדר-היום. יש לנו זמן. \n\nיעקב\ + \ אשר (יהדות התורה): \n\n- - - \n\nהיו\"ר עידית סילמן: \n\nחבר הכנסת יעקב\ + \ אשר, פעם שנייה. \n\nארבל אסטרחן: \n\nתכליתה של הוועדה המסדרת היא להקים את\ + \ ועדות הכנסת הקבועות. זו הצעתנו לוועדה המסדרת לפעול. עלו פה כל מיני הצעות אחרות.\ + \ אנחנו נשאלים האם יש מניעה לעשות את זה. לכן גם אם זה לא הדבר הנכון והמתאים ביותר-\ + \ - -\n\nדוד ביטן (הליכוד): \n\nאי –אפשר לשלוח למשהו ערטילאי.\n\nארבל אסטרחן:\ + \ \n\nלמשל, צריך לזכור שהכנסת ה-21 הקימה רק ועדת חינוך. יתר הוועדות לא קמו.\ + \ הכנסת ה-22 הקימה רק ועדת כנסת. \n\nאבי מעוז (הציונות הדתית): \n\nהיתה ממשלת\ + \ מעבר.\n\nהיו\"ר עידית סילמן: \n\nחבר הכנסת אבי מעוז, פעם ראשונה. \n\nאבי מעוז\ + \ (הציונות הדתית): \n\nהדוגמאות שאת מביאה – הן לא ממין העניין. \n\nאופיר כץ\ + \ (הליכוד): \n\n- - - \n\nהיו\"ר עידית סילמן: \n\nאתה תצא מהאולם, חבר הכנסת\ + \ אופיר כץ. שלא תגיד שלא אמרתי.\n\nחבר הכנסת אבי מעוז, אתה לא מפריע. יפה. 
חברת\ + \ הכנסת עידית סילמן, יושבת-ראש הוועדה המסדרת, כרגע נשארת בהחלטתה הראשונה שהביאה\ + \ אותה לדיון שאנחנו ממשיכים בנושא של מה שהובא לפה לדיון, להעביר לוועדת חוקה, חוק\ + \ ומשפט לכשתקום הוועדה הקבועה. חבר הכנסת יבגני סובה, בבקשה." + - 'passage: החקירה המשפטית + + מרגע שנתקבל אישור מהממשלה לפתוח בחקירה ממשלתית, העביר מרסייה את התיק לידי הגנרל + פליקס גוסטב סוסייה, המושל הצבאי של פריז. לסוסייה הייתה יריבות אישית ארוכה עם מרסייה, + והוא לא היה להוט לפתוח בחקירה. אולם לא הייתה לו ברירה, וב-3 בנובמבר הסמיך את הקומנדנט + בקסון ד''אומרשוויל לבצע את החקירה. חקירתו של ד''אומרשוויל הייתה רשלנית ומוטית, + ונעשתה מתוך אמונה ראשונית שדרייפוס אכן אשם. הוא התעלם מכל החוקרים אשר טענו כי + אין דמיון בין הכתב בבורדרו לכתב ידו של דרייפוס, ובחר לקבל רק את הדעות הנגדיות. + לעיתים קרובות הוא חזר בתחקירים שלו מילה במילה על תחקיריו של פאטי. חלק נכבד מהתחקירים + יוחדו לנשים אשר דרייפוס היה עמן בקשר. ד''אומרשוויל היה מודע לכך שהראיות נגד דרייפוס + מעטות מאוד, והוא לא היסס לנצל אפילו עובדה זו כדי לטעון שדרייפוס אכן אשם. כך לדוגמה, + הוא טען שהעובדה שבביתו של דרייפוס נמצאו מעט מאוד מכתבים, מעידה על כך שדרייפוס + השמיד את כל המכתבים החשודים. ד''אומרשוויל התייעץ עם פאטי בכל שלב משלבי החקירה, + עד כדי כך שהדבר נמאס אפילו על אנשי צוותו. ב-3 בדצמבר הגיש ד''אומרשוויל את הדו"ח + שלו לסוסייה, והמליץ בו על פתיחת משפט צבאי כנגד דרייפוס. סוסייה התנגד עקרונית למהלך, + והיה מסוגל להורות על ביטול החקירה, אולם לא עשה זאת, ככל הנראה מתוך אמונה בצדק + הצבאי. למחרת הוא הוציא את ההוראה להעמיד את דרייפוס למשפט צבאי בפריז וקבע את מועד + תחילת המשפט ל-19 בדצמבר.' +- source_sentence: 'query: האם לעובדת בקיצור שעות של אימהות ניתן גם קיצור בגלל צו + 8?' + sentences: + - "passage: סגן השר לביטחון הפנים יואב סגלוביץ': \n\nאני מודה לכולם – גם אם יכולנו\ + \ לעלות על דרך מלך טובה יותר – אני מבטיח שהחוק הזה יוגש לפני תום המועד, אני אדאג\ + \ לכך וגם אגיע לאותם הדיונים באותה הוועדה שתיקבע. נכנסתי כעת לרשות המבצעת אבל\ + \ אני יודע בדיוק מה תפקידה של הכנסת ומה החשיבות של פיקוח פרלמנטרי. 
אעשה כל שצריך\ + \ על מנת שהחוק יהיה מאוזן על מנת שיהיה חוק טוב יותר. נבצע פה דיון עומק כי יש לבצע\ + \ דיון עומק בחוקים כאלו. מודה שוב לכולם על ההסכמה. הלא מובנת מאליה בתקופה המשונה\ + \ שעוברת עלינו. חבר הכנסת ליצמן נתן לי מחמאה לא מזמן, הוא אמר לי שאני צעיר, נאיבי\ + \ ותמים. אז קניתי את זה. \n\nמשה גפני (יהדות התורה): \n\nליצמן אמר את זה? אני\ + \ אלך לדבר איתו. \n\nסגן השר לביטחון הפנים יואב סגלוביץ': \n\nלקחתי את זה כמחמאה\ + \ וקחו את הקטע הזה גם בהתאם לדיונים שיהיו בוועדה ולגבי החוק בכלל, תודה רבה.\n\n\ + היו\"ר אלכס קושניר: \n\nחברים, אני מבקש להצביע על חוק לייעול האכיפה והפיקוח\ + \ העירוניים ברשויות המקומיות (הוראת שעה) (תיקון מס – יבוא בהמשך) (הפיכה להוראת\ + \ קבע), התשפ\"א–2021. אני רוצה לומר שיהיו במליאה רשויות דיבור לסיעת הליכוד, ש\"\ + ס, יהדות התורה, והציונות הדתית והרשימה המשותפת למשך שלוש שעות ביחד. מי בעד החוק?\ + \ \n\nהצבעה\n\nאושר" + - 'passage: טקס פרסי האקדמיה הבריטית לקולנוע ה-45 שמעניקה האקדמיה הבריטית לאמנויות + הקולנוע והטלוויזיה, נערך בשנת 1992, בו הוענקו פרסים לסרטים הטובים ביותר של שנת + 1991. + + סרטו של אלן פארקר "הקומיטמנטס" זכה בפרס הסרט הטוב ביותר, הבמאי, התסריט המעובד + והעריכה. הסרט "שתיקת הכבשים" זיכה את אנתוני הופקינס בפרס השחקן הטוב ביותר ואת + ג''ודי פוסטר בפרס השחקנית הטובה ביותר. הסרט "שתיקת הכבשים" זכה גם בפרס אוסקר. + אלן ריקמן זכה בפרס שחקן המשנה על הופעתו בסרט "רובין הוד: נסיך הגנבים". קייט נליגן + זכתה בפרס שחקנית המשנה על משחקה בסרט "פרנקי וג''וני".' + - "passage: יום עבודה ושבוע עבודה\n \nשבוע עבודה מלא\nמקובל להתייחס לשבוע כפרק זמן\ + \ המתחיל ביום א' ומסתיים במוצאי שבת בחצות.\nלפי צו הרחבה בדבר קיצור שבוע העבודה\ + \ במשק ל-42 שעות שבועיות, שבוע העבודה קוצר ל- 42 שעות (במקום 43). 
הקיצור חל גם\ + \ על עובדים שעובדים יותר מ-42 שעות ופחות מ-43 שעות.\nקיצור שבוע העבודה ל-42 שעות\ + \ שבועיות חל רק על עובדים במשרה מלאה שתקן שעות העבודה שלהם גדול מ-42 שעות.\nהיקף\ + \ המשרה של עובדים במשרה חלקית יחושב בהתאמה לפי 42 שעות שבועיות או 182 שעות חודשיות\ + \ (ראו דוגמה למעלה).\nבהתאמה לקיצור שבוע העבודה, שכר שעה לעובד במשכורת חודשית\ + \ יחושב לפי בסיס של 182 שעות עבודה בחודש (במקום 186 שעות).\nשבוע העבודה של עובד\ + \ המועסק בעבודת לילה לא יעלה על 58 שעות, כולל שעות נוספות.\n\nחודש עבודה\nמאחר\ + \ שיש חודשים קצרים יותר וחודשים ארוכים יותר, שעות התקן החודשיות משתנות מחודש לחודש.\n\ + חודש עבודה לצורך חישוב ערך שעת עבודה של עובד המועסק במשרה מלאה או לצורך חישוב\ + \ היקף משרה של עובד הוא 182 שעות.\nעובד שבכל יום עבודה באותו חודש עבד את מלוא\ + \ שעות התקן היומי, ייחשב כמי שעבד משרה מלאה באותו חודש ומילא את תקן השעות החודשי.\n\ + \nחישוב היקף המשרה לעובד לפי שעות\nכדי לחשב את היקף המשרה של עובד לפי שעות ראו\ + \ חישוב היקף משרה של עובד לפי שעות.\n\nהרחבות\nענף: עובדי חברות שמירה ואבטחה\n\ + \nהרחבה\n: סעיף 6 לצו הרחבה בענף השמירה והאבטחה 2014 קובע כי שבוע העבודה בענף\ + \ זה הוא של 5 ימים (בהתאם לצו ההרחבה בדבר קיצור שבוע העבודה, מאפריל 2018 שבוע\ + \ העבודה הוא 42 שעות).\n\nהרחבות\nענף: עובדי אולמות וגני אירועים\n\nהרחבה\n: \n\ + אורך משמרת לילה הוא 6 שעות, ולכן זכאי עובד בענף זה לגמול שעות נוספות מהשעה השביעית.\n\ + למידע נוסף ראו גמול עבור שעות נוספות במשמרת לילה לעובדי אולמות וגני אירועים." 
+pipeline_tag: sentence-similarity +library_name: sentence-transformers +metrics: +- pearson_cosine +- spearman_cosine +model-index: +- name: SentenceTransformer based on intfloat/multilingual-e5-large + results: + - task: + type: semantic-similarity + name: Semantic Similarity + dataset: + name: sts dev + type: sts-dev + metrics: + - type: pearson_cosine + value: 0.4256363241599687 + name: Pearson Cosine + - type: spearman_cosine + value: 0.4259303069483182 + name: Spearman Cosine +--- + +# SentenceTransformer based on intfloat/multilingual-e5-large + +This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [intfloat/multilingual-e5-large](https://huggingface.co/intfloat/multilingual-e5-large) on the train_dataset, HebNLI, HebQA, RAGbot and ParaShoot datasets. It maps sentences & paragraphs to a 1024-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more. + +## Model Details + +### Model Description +- **Model Type:** Sentence Transformer +- **Base model:** [intfloat/multilingual-e5-large](https://huggingface.co/intfloat/multilingual-e5-large) +- **Maximum Sequence Length:** 512 tokens +- **Output Dimensionality:** 1024 dimensions +- **Similarity Function:** Cosine Similarity +- **Training Datasets:** + - train_dataset + - HebNLI + - HebQA + - RAGbot + - ParaShoot + + + +### Model Sources + +- **Documentation:** [Sentence Transformers Documentation](https://sbert.net) +- **Repository:** [Sentence Transformers on GitHub](https://github.com/UKPLab/sentence-transformers) +- **Hugging Face:** [Sentence Transformers on Hugging Face](https://huggingface.co/models?library=sentence-transformers) + +### Full Model Architecture + +``` +SentenceTransformer( + (0): Transformer({'max_seq_length': 512, 'do_lower_case': False, 'architecture': 'XLMRobertaModel'}) + (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': False, 
'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True}) + (2): Normalize() +) +``` + +## Usage + +### Direct Usage (Sentence Transformers) + +First install the Sentence Transformers library: + +```bash +pip install -U sentence-transformers +``` + +Then you can load this model and run inference. +```python +from sentence_transformers import SentenceTransformer + +# Download from the 🤗 Hub +model = SentenceTransformer("sentence_transformers_model_id") +# Run inference +sentences = [ + 'query: האם לעובדת בקיצור שעות של אימהות ניתן גם קיצור בגלל צו 8?', + "passage: יום עבודה ושבוע עבודה\n \nשבוע עבודה מלא\nמקובל להתייחס לשבוע כפרק זמן המתחיל ביום א' ומסתיים במוצאי שבת בחצות.\nלפי צו הרחבה בדבר קיצור שבוע העבודה במשק ל-42 שעות שבועיות, שבוע העבודה קוצר ל- 42 שעות (במקום 43). הקיצור חל גם על עובדים שעובדים יותר מ-42 שעות ופחות מ-43 שעות.\nקיצור שבוע העבודה ל-42 שעות שבועיות חל רק על עובדים במשרה מלאה שתקן שעות העבודה שלהם גדול מ-42 שעות.\nהיקף המשרה של עובדים במשרה חלקית יחושב בהתאמה לפי 42 שעות שבועיות או 182 שעות חודשיות (ראו דוגמה למעלה).\nבהתאמה לקיצור שבוע העבודה, שכר שעה לעובד במשכורת חודשית יחושב לפי בסיס של 182 שעות עבודה בחודש (במקום 186 שעות).\nשבוע העבודה של עובד המועסק בעבודת לילה לא יעלה על 58 שעות, כולל שעות נוספות.\n\nחודש עבודה\nמאחר שיש חודשים קצרים יותר וחודשים ארוכים יותר, שעות התקן החודשיות משתנות מחודש לחודש.\nחודש עבודה לצורך חישוב ערך שעת עבודה של עובד המועסק במשרה מלאה או לצורך חישוב היקף משרה של עובד הוא 182 שעות.\nעובד שבכל יום עבודה באותו חודש עבד את מלוא שעות התקן היומי, ייחשב כמי שעבד משרה מלאה באותו חודש ומילא את תקן השעות החודשי.\n\nחישוב היקף המשרה לעובד לפי שעות\nכדי לחשב את היקף המשרה של עובד לפי שעות ראו חישוב היקף משרה של עובד לפי שעות.\n\nהרחבות\nענף: עובדי חברות שמירה ואבטחה\n\nהרחבה\n: סעיף 6 לצו הרחבה בענף השמירה והאבטחה 2014 קובע כי שבוע העבודה בענף זה הוא של 5 ימים (בהתאם לצו 
ההרחבה בדבר קיצור שבוע העבודה, מאפריל 2018 שבוע העבודה הוא 42 שעות).\n\nהרחבות\nענף: עובדי אולמות וגני אירועים\n\nהרחבה\n: \nאורך משמרת לילה הוא 6 שעות, ולכן זכאי עובד בענף זה לגמול שעות נוספות מהשעה השביעית.\nלמידע נוסף ראו גמול עבור שעות נוספות במשמרת לילה לעובדי אולמות וגני אירועים.", + 'passage: סגן השר לביטחון הפנים יואב סגלוביץ\': \n\nאני מודה לכולם – גם אם יכולנו לעלות על דרך מלך טובה יותר – אני מבטיח שהחוק הזה יוגש לפני תום המועד, אני אדאג לכך וגם אגיע לאותם הדיונים באותה הוועדה שתיקבע. נכנסתי כעת לרשות המבצעת אבל אני יודע בדיוק מה תפקידה של הכנסת ומה החשיבות של פיקוח פרלמנטרי. אעשה כל שצריך על מנת שהחוק יהיה מאוזן על מנת שיהיה חוק טוב יותר. נבצע פה דיון עומק כי יש לבצע דיון עומק בחוקים כאלו. מודה שוב לכולם על ההסכמה. הלא מובנת מאליה בתקופה המשונה שעוברת עלינו. חבר הכנסת ליצמן נתן לי מחמאה לא מזמן, הוא אמר לי שאני צעיר, נאיבי ותמים. אז קניתי את זה. \n\nמשה גפני (יהדות התורה): \n\nליצמן אמר את זה? אני אלך לדבר איתו. \n\nסגן השר לביטחון הפנים יואב סגלוביץ\': \n\nלקחתי את זה כמחמאה וקחו את הקטע הזה גם בהתאם לדיונים שיהיו בוועדה ולגבי החוק בכלל, תודה רבה.\n\nהיו"ר אלכס קושניר: \n\nחברים, אני מבקש להצביע על חוק לייעול האכיפה והפיקוח העירוניים ברשויות המקומיות (הוראת שעה) (תיקון מס – יבוא בהמשך) (הפיכה להוראת קבע), התשפ"א–2021. אני רוצה לומר שיהיו במליאה רשויות דיבור לסיעת הליכוד, ש"ס, יהדות התורה, והציונות הדתית והרשימה המשותפת למשך שלוש שעות ביחד. מי בעד החוק? 
\n\nהצבעה\n\nאושר', +] +embeddings = model.encode(sentences) +print(embeddings.shape) +# [3, 1024] + +# Get the similarity scores for the embeddings +similarities = model.similarity(embeddings, embeddings) +print(similarities) +# tensor([[1.0000, 0.8013, 0.7637], +# [0.8013, 1.0000, 0.7747], +# [0.7637, 0.7747, 1.0000]]) +``` + + + + + + + +## Evaluation + +### Metrics + +#### Semantic Similarity + +* Dataset: `sts-dev` +* Evaluated with [EmbeddingSimilarityEvaluator](https://sbert.net/docs/package_reference/sentence_transformer/evaluation.html#sentence_transformers.evaluation.EmbeddingSimilarityEvaluator) + +| Metric | Value | +|:--------------------|:-----------| +| pearson_cosine | 0.4256 | +| **spearman_cosine** | **0.4259** | + + + + + +## Training Details + +### Training Datasets +
train_dataset + +#### train_dataset + +* Dataset: train_dataset +* Size: 40,680 training samples +* Columns: sentence1, sentence2, and score +* Approximate statistics based on the first 1000 samples: + | | sentence1 | sentence2 | score | + |:--------|:-----------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------|:---------------------------------------------------------------| + | type | string | string | float | + | details |
  • min: 10 tokens
  • mean: 20.16 tokens
  • max: 48 tokens
|
  • min: 29 tokens
  • mean: 329.02 tokens
  • max: 512 tokens
|
  • min: 0.0
  • mean: 0.31
  • max: 1.0
| +* Samples: + | sentence1 | sentence2 | score | + |:----------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:-----------------| + | query: מה התשלום עבור דרגה X במעון או משפחתון?
מה הזכאות עבור ילד נוסף במסגרת מעון או משפחתון?
| passage: דרגות השתתפות במימון מעונות יום ומשפחתונים

גובה שכר הלימוד
סכומי השתתפות המדינה והסכומים אותם משלימים ההורים עבור כל אחת מהדרגות, מפורטים בטבלאות שכר לימוד.
הטבלאות מחולקות לפי תעריפים לילדים ותעריפים לתינוקות:

גובה שכר הלימוד בהתאם לדרגות השונות מפורטת בטבלאות שכר לימוד במסגרות ילדים באתר משרד העבודה.
הדרגות המופיעות בטבלאות מותאמות לגובה ההכנסה לנפש במשפחה.
ככל שדרגת הזכאות (בטווח 12-3) נמוכה יותר, כך גדלה השתתפות המדינה בתשלום.
עבור רמות ההשתתפות המותאמות לגובה ההכנסות, יש להוסיף או להפחית דרגות, על-פי מספר מספר הילדים השוהים במעון או במשפחתון - למשפחה עם כמה ילדים השוהים במעון או במשפחתון, תוגדל רמת השתתפות המדינה עבור כל אחד מהילדים (תיקבע דרגה נמוכה יותר בין הדרגות 12-3).
תינוקות הם מי שגילם היה עד 15 חודשים ב-1 בספטמבר של שנת הלימודים אליה נרשמו.
ילדים הם מי שב-1 בספטמבר של שנת הלימודים אליה נרשמו, גילם היה בין 15 חודשים ויום ל-33 חודשים (או עד לגיל 46 חודשים, למי שיש להם אישור מהיחידה להתפתחות הילד כי הם מעוכבי התפתחות).

השפעת מספר הילדים השוהים במעון/משפחתון על דר...
| 0.8 | + | query: מה התשלום עבור דרגה X במעון או משפחתון?
מה הזכאות עבור ילד נוסף במסגרת מעון או משפחתון?
| passage: סיוע להורים עצמאיים (הורים יחידים) במימון מעונות יום ומשפחתונים

הורים עצמאיים (הורים יחידים) (/he/הורה_עצמאי_(הורה_יחיד)) עשויים להיות זכאים לסיוע במימון מעונות יום ומשפחתונים לפי דרגות השתתפות (/he/דרגות_השתתפות_במימון_מעונות_יום_ומשפחתונים) שנקבעות בהתאם לרמת ההכנסה שלהם (/he/רמת_ההכנסה_לנפש_לצורך_סיוע_במימון_מעונות_יום)



הורים עצמאיים שעונים על הגדרת מגבירי עבודה (.D7.96.D7.9B.D7.90.D7.95.D7.AA_.D7.A9.D7.9C_.D7.94.D7.95.D7.A8.D7.99.D7.9D_.D7.9E.D7.92.D7.91.D7.99.D7.A8.D7.99_.D7.A2.D7.91.D7.95.D7.93.D7.94) זכאים לסיוע מוגדל וישלמו 250 בחודש לילד אחד ו-375 בחודש לשני ילדים
כדי להקל על שילובם של הורים בשוק העבודה, המדינה מסייעת להם במימון מעונות יום ומשפחתונים מוכרים.

גובה התמיכה בזכאות הכללית של ההורים (זוגות הורים או הורים עצמאיים) נקבע על-פי מספר דרגות השתתפות המתבססות על רמת ההכנסה לנפש במשפחה.
הורים עצמאיים (הורים יחידים) שעונים על הגדרת "מגבירי עבודה" (בהתאם למפורט בהמשך), זכאים לסיוע מוגדל וישלמו רק 250 ₪ בחודש עבור ילד אחד ו-375 ₪ בחודש עבור שני ילדים.
לפרטים ומי...
| 0.7 | + | query: מה התשלום עבור דרגה X במעון או משפחתון?
מה הזכאות עבור ילד נוסף במסגרת מעון או משפחתון?
| passage: השתתפות במימון מעונות יום ומשפחתונים
תהליך מימוש הזכות
איתור מסגרת מוכרת
מערכת אינטרנטית (מקוונת) מאפשרת לאתר משפחתונים, מעונות וצהרונים הנמצאים בפיקוח ממשלתי.
ניתן לבצע חיפוש לפי ישוב, סוג המסגרת או שם המסגרת. לכניסה למערכת לחצו כאן.

התשלום למעון/משפחתון
עבור חודש אוגוסט יחושב תשלום באופן יחסי בהתאם לתקופת שהות הילד במעון (העלות המלאה לחודש כפול מספר החודשים שהילד שהה במעון חלקי 12 חודשים).
החזר כספי בהתאם לדרגת ההשתתפות, יינתן להורים עבור החודש שבו התקבל שאלון ההרשמה במוקד, ועבור החודש שקדם לו.
השתתפות בחודש הראשון לכניסת הילד למסגרת (מעון או משפחתון):

הוריו של ילד שנכנס למסגרת עד ל-15 בחודש (כולל ה-15 לחודש), יהיו זכאים להשתתפות המדינה עבור אותו חודש (על-פי הדרגה שנקבעה להם).
הוריו של ילד שנכנס למסגרת לאחר ה-15 לחודש, לא יהיו זכאים להשתתפות בחודש זה. במקרה זה יחויבו ההורים לשלם דמי החזקה באופן יחסי לימים בהם שהה הילד במסגרת.

היעדרות של הילד
היעדרות מהמעון לתקופה של יותר מ-21 ימים תיחשב כעזיבה, אלא אם כן הוצגו במהלך תקופה זו אישורים רפואיים על מחלה או אשפוז של הילד.
היעד...
| 0.8 | +* Loss: [CoSENTLoss](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#cosentloss) with these parameters: + ```json + { + "scale": 20.0, + "similarity_fct": "pairwise_cos_sim" + } + ``` +
+
HebNLI + +#### HebNLI + +* Dataset: HebNLI +* Size: 303,034 training samples +* Columns: sentence1, sentence2, and score +* Approximate statistics based on the first 1000 samples: + | | sentence1 | sentence2 | score | + |:--------|:-----------------------------------------------------------------------------------|:----------------------------------------------------------------------------------|:-----------------------------------------------------------------| + | type | string | string | float | + | details |
  • min: 8 tokens
  • mean: 34.67 tokens
  • max: 155 tokens
|
  • min: 7 tokens
  • mean: 18.75 tokens
  • max: 47 tokens
|
  • min: 0.05
  • mean: 0.47
  • max: 0.91
| +* Samples: + | sentence1 | sentence2 | score | + |:------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------|:---------------------------------| + | query: אז בכל מקרה, אולי הוא ישים את ווילי הורטון הזה על הכרטיס אחרי הכול. | passage: לא הרבה אנשים יודעים מי זה ווילי הורטון. | 0.3812697383602913 | + | query: האם ניתנה תשומת לב מספקת לערכים חריגים? | passage: לסטיות לא צריך לתת שום תשומת לב, נכון? | 0.31741741461323636 | + | query: כפי שסוכם עם המשתתפים, מטרת הדיון לא הייתה להגיע לקונצנזוס, אלא לעסוק בדיאלוג פתוח ללא ייחוס. | passage: הוסכם עם המשתתפים לוודא שהדיון יכלול שיח פתוח. | 0.5154610986925753 | +* Loss: [CoSENTLoss](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#cosentloss) with these parameters: + ```json + { + "scale": 20.0, + "similarity_fct": "pairwise_cos_sim" + } + ``` +
+
HebQA + +#### HebQA + +* Dataset: HebQA +* Size: 60,294 training samples +* Columns: sentence1, sentence2, and score +* Approximate statistics based on the first 1000 samples: + | | sentence1 | sentence2 | score | + |:--------|:-----------------------------------------------------------------------------------|:--------------------------------------------------------------------------------------|:-----------------------------------------------------------------| + | type | string | string | float | + | details |
  • min: 10 tokens
  • mean: 18.47 tokens
  • max: 36 tokens
|
  • min: 165 tokens
  • mean: 244.67 tokens
  • max: 493 tokens
|
  • min: 0.01
  • mean: 0.67
  • max: 0.99
| +* Samples: + | sentence1 | sentence2 | score | + |:-------------------------------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:--------------------------------| + | query: כמה אנשים היו בוועדה שניסחה את הטיוטה הרביעית? | passage: לאחר שהטיוטות הראשוניות הובאו למנהלת העם וספגו ביקורת חריפה, הוקמה ועדת חמישה בראשות משה שרת שניסחה טיוטה רביעית. ב-13 במאי לפנות ערב הוגשה הצעתה למנהלת העם. שרת עשה את רוב המלאכה, תוך התייעצות עם משפטנים מומחים. נוסח ההכרזה הושאל מכתב המנדט וממסמכים משפטיים רבים אחרים, וכל סעיף שלו התחיל במילים "הואיל ו...". בהצעה זו הוזכרה גם תוכנית החלוקה של האו"ם. מזכיר מנהלת העם כתב כי "בן-גוריון התנגד ל'הואיל' כי אינו עברי" וכן "התנגד למילים 'ישוב רב־איל', 'עוז וגבורה' וכיוצא באלה". כמו כן, התנגד להזכרה מפורשת של תוכנית החלוקה. | 0.5564618997042421 | + | query: כמה אנשים היו בוועדה שניסחה את הטיוטה הרביעית? | passage: לאחר שהטיוטות הראשוניות הובאו למנהלת העם וספגו ביקורת חריפה, הוקמה ועדת חמישה בראשות משה שרת שניסחה טיוטה רביעית. ב-13 במאי לפנות ערב הוגשה הצעתה למנהלת העם. שרת עשה את רוב המלאכה, תוך התייעצות עם משפטנים מומחים. נוסח ההכרזה הושאל מכתב המנדט וממסמכים משפטיים רבים אחרים, וכל סעיף שלו התחיל במילים "הואיל ו...". בהצעה זו הוזכרה גם תוכנית החלוקה של האו"ם. מזכיר מנהלת העם כתב כי "בן-גוריון התנגד ל'הואיל' כי אינו עברי" וכן "התנגד למילים 'ישוב רב־איל', 'עוז וגבורה' וכיוצא באלה". כמו כן, התנגד להזכרה מפורשת של תוכנית החלוקה. | 0.99 | + | query: באילו מילים התחילו כל סעיפי הטויוטה? 
| passage: לאחר שהטיוטות הראשוניות הובאו למנהלת העם וספגו ביקורת חריפה, הוקמה ועדת חמישה בראשות משה שרת שניסחה טיוטה רביעית. ב-13 במאי לפנות ערב הוגשה הצעתה למנהלת העם. שרת עשה את רוב המלאכה, תוך התייעצות עם משפטנים מומחים. נוסח ההכרזה הושאל מכתב המנדט וממסמכים משפטיים רבים אחרים, וכל סעיף שלו התחיל במילים "הואיל ו...". בהצעה זו הוזכרה גם תוכנית החלוקה של האו"ם. מזכיר מנהלת העם כתב כי "בן-גוריון התנגד ל'הואיל' כי אינו עברי" וכן "התנגד למילים 'ישוב רב־איל', 'עוז וגבורה' וכיוצא באלה". כמו כן, התנגד להזכרה מפורשת של תוכנית החלוקה. | 0.4362805487252798 | +* Loss: [CoSENTLoss](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#cosentloss) with these parameters: + ```json + { + "scale": 20.0, + "similarity_fct": "pairwise_cos_sim" + } + ``` +
+
RAGbot + +#### RAGbot + +* Dataset: RAGbot +* Size: 7,780 training samples +* Columns: sentence1, sentence2, and score +* Approximate statistics based on the first 1000 samples: + | | sentence1 | sentence2 | score | + |:--------|:-----------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------|:----------------------------------------------------------------| + | type | string | string | float | + | details |
  • min: 8 tokens
  • mean: 24.34 tokens
  • max: 135 tokens
|
  • min: 69 tokens
  • mean: 420.78 tokens
  • max: 512 tokens
|
  • min: 0.04
  • mean: 0.8
  • max: 0.99
| +* Samples: + | sentence1 | sentence2 | score | + |:---------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:--------------------------------| + | query: לא ברור באתר
אם על חברות ביטוח פרטיות
חל חוק חופש המידע
אני מציע שזה יהיה כתוב מפורש
| passage: חוק חופש המידע, התשנ"ח-1998 מאפשר לכל אזרח או תושב ישראל (/he/תושב_ישראל) לפנות בבקשה לרשות ציבורית לקבל מידע אותו היא מחזיקה. מטרתו הראשית של החוק היא להטמיע תפיסת יסוד של שיתוף במידע הציבורי.
פרטים

שם החוק:חוק חופש המידע, התשנ"ח-1998
קישור:החוק באתר נבו
שר אחראי:שר המשפטים
החוק ב"ספר החוקים הפתוח"
נושאים וזכויות
חופש המידע
הזכות לפרטיות
| 0.502234399276465 | + | query: לא ברור באתר
אם על חברות ביטוח פרטיות
חל חוק חופש המידע
אני מציע שזה יהיה כתוב מפורש
| passage: חוק חופש המידע, התשנ"ח-1998 מאפשר לכל אזרח או תושב ישראל (/he/תושב_ישראל) לפנות בבקשה לרשות ציבורית לקבל מידע אותו היא מחזיקה. מטרתו הראשית של החוק היא להטמיע תפיסת יסוד של שיתוף במידע הציבורי.
פרטים

שם החוק:חוק חופש המידע, התשנ"ח-1998
קישור:החוק באתר נבו
שר אחראי:שר המשפטים
החוק ב"ספר החוקים הפתוח"
נושאים וזכויות
חופש המידע
הזכות לפרטיות
| 0.99 | + | query: לא ברור באתר
אם על חברות ביטוח פרטיות
חל חוק חופש המידע
אני מציע שזה יהיה כתוב מפורש
| passage: כל אזרח או תושב ישראל זכאי לקבלת מידע מרשות ציבורית (https://www.gov.il/he/departments/general/list_of_authorities)



לצורך קבלת המידע יש לפנות לממונה על יישום חוק חופש המידע ברשות הציבורית



לפני הגשת בקשה לקבלת מידע, מומלץ לברר אם המידע כבר קיים במאגר התשובות (https://www.gov.il/he/departments/general/answer_reservoir) של היחידה הממשלתית לחופש המידע במשרד המשפטים



הגשת הבקשה כרוכה בתשלום אגרה, אך במקרים מסוימים ניתן פטור מתשלום האגרה



למידע נוסף ראו הגשת בקשת חופש המידע (https://www.gov.il/he/service/freedom_of_information_submission) באתר היחידה הממשלתית לחופש המידע במשרד המשפטים
חוק חופש המידע קובע כי כל אזרח או תושב ישראל זכאי לקבל מידע מרשות ציבורית.

לרשימה המלאה של הרשויות הציבוריות שעליהן חל החוק ראו כאן.

אוכלוסיית יעד ותנאים מקדימים
כל אזרח או תושב ישראל.

למי ואיך פונים
יש לבדוק איזו רשות אחראית על הנושא שלגביו מבוקש המידע.
יש לפנות לממונה על יישום חוק חופש המידע ברשות האחראית:

לרשימת הממונים על יישום חוק חופש המידע ברשויות השונות באתר היחידה הממשלתית לח...
| 0.4094745764349462 | +* Loss: [CoSENTLoss](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#cosentloss) with these parameters: + ```json + { + "scale": 20.0, + "similarity_fct": "pairwise_cos_sim" + } + ``` +
+
ParaShoot + +#### ParaShoot + +* Dataset: ParaShoot +* Size: 6,076 training samples +* Columns: sentence1, sentence2, and score +* Approximate statistics based on the first 1000 samples: + | | sentence1 | sentence2 | score | + |:--------|:----------------------------------------------------------------------------------|:--------------------------------------------------------------------------------------|:---------------------------------------------------------------| + | type | string | string | float | + | details |
  • min: 9 tokens
  • mean: 18.41 tokens
  • max: 44 tokens
|
  • min: 167 tokens
  • mean: 259.73 tokens
  • max: 512 tokens
|
  • min: 0.0
  • mean: 0.79
  • max: 1.0
| +* Samples: + | sentence1 | sentence2 | score | + |:-----------------------------------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------| + | query: במה בתחילת דרכה עסקה חברתו של אהרון רוזנפלד? | passage: בתחילה עסקה חברתו מכל הבא ליד אך עם הגברת הבנייה בשנות ה-20 יצר רוזנפלד קשרים בבלגיה, ייבא משם חומרי בניין (בעיקר זכוכית) וייצג את חברת הספנות הבלגית שהביאה את החומרים הללו. כך עגנה ב-1923 בחיפה האונייה הבלגית הראשונה. רוזנפלד היה גם סוכנן של חברות ספנות ואוניות אמריקאיות. בשל פיתוח יחסי העסקים עם בלגיה התמנה ב-1929 לסגן-קונסול וב-1934 לקונסול בלגיה וליבריה בחיפה. ב-1932 קיבל מאלברט הראשון מלך בלגיה את אות הכבוד של אביר מסדר הכתר הבלגי . רוזנפלד השתתף בהקמתם של המרכז המסחרי הישן והחדש בעיר התחתית, והיה ממקימי "בנק בעלי הבתים בע"מ" ב-15 ביולי 1931. רוזנפלד היה חבר בארגונים חברתיים ועסקיים שונים, ובין השאר היה נשיא הבונים החופשיים, חבר ועדת הנמל בלשכת המסחר והתעשייה העברית ולשכת הספנות החיפאית (מ"מ נשיא לשכת הספנות באמצע שנות ה-60) ואזרח כבוד של חיפה. | 0.545198086492799 | + | query: במה בתחילת דרכה עסקה חברתו של אהרון רוזנפלד? 
| passage: בתחילה עסקה חברתו מכל הבא ליד אך עם הגברת הבנייה בשנות ה-20 יצר רוזנפלד קשרים בבלגיה, ייבא משם חומרי בניין (בעיקר זכוכית) וייצג את חברת הספנות הבלגית שהביאה את החומרים הללו. כך עגנה ב-1923 בחיפה האונייה הבלגית הראשונה. רוזנפלד היה גם סוכנן של חברות ספנות ואוניות אמריקאיות. בשל פיתוח יחסי העסקים עם בלגיה התמנה ב-1929 לסגן-קונסול וב-1934 לקונסול בלגיה וליבריה בחיפה. ב-1932 קיבל מאלברט הראשון מלך בלגיה את אות הכבוד של אביר מסדר הכתר הבלגי . רוזנפלד השתתף בהקמתם של המרכז המסחרי הישן והחדש בעיר התחתית, והיה ממקימי "בנק בעלי הבתים בע"מ" ב-15 ביולי 1931. רוזנפלד היה חבר בארגונים חברתיים ועסקיים שונים, ובין השאר היה נשיא הבונים החופשיים, חבר ועדת הנמל בלשכת המסחר והתעשייה העברית ולשכת הספנות החיפאית (מ"מ נשיא לשכת הספנות באמצע שנות ה-60) ואזרח כבוד של חיפה. | 0.99 | + | query: מה אירע בחיפה לראשונה בשנת 1923? | passage: בתחילה עסקה חברתו מכל הבא ליד אך עם הגברת הבנייה בשנות ה-20 יצר רוזנפלד קשרים בבלגיה, ייבא משם חומרי בניין (בעיקר זכוכית) וייצג את חברת הספנות הבלגית שהביאה את החומרים הללו. כך עגנה ב-1923 בחיפה האונייה הבלגית הראשונה. רוזנפלד היה גם סוכנן של חברות ספנות ואוניות אמריקאיות. בשל פיתוח יחסי העסקים עם בלגיה התמנה ב-1929 לסגן-קונסול וב-1934 לקונסול בלגיה וליבריה בחיפה. ב-1932 קיבל מאלברט הראשון מלך בלגיה את אות הכבוד של אביר מסדר הכתר הבלגי . רוזנפלד השתתף בהקמתם של המרכז המסחרי הישן והחדש בעיר התחתית, והיה ממקימי "בנק בעלי הבתים בע"מ" ב-15 ביולי 1931. רוזנפלד היה חבר בארגונים חברתיים ועסקיים שונים, ובין השאר היה נשיא הבונים החופשיים, חבר ועדת הנמל בלשכת המסחר והתעשייה העברית ולשכת הספנות החיפאית (מ"מ נשיא לשכת הספנות באמצע שנות ה-60) ואזרח כבוד של חיפה. | 0.40662873982648273 | +* Loss: [CoSENTLoss](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#cosentloss) with these parameters: + ```json + { + "scale": 20.0, + "similarity_fct": "pairwise_cos_sim" + } + ``` +
+ +### Evaluation Dataset + +#### Unnamed Dataset + +* Size: 40,680 evaluation samples +* Columns: sentence1, sentence2, and score +* Approximate statistics based on the first 1000 samples: + | | sentence1 | sentence2 | score | + |:--------|:-----------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------|:---------------------------------------------------------------| + | type | string | string | float | + | details |
  • min: 10 tokens
  • mean: 20.16 tokens
  • max: 48 tokens
|
  • min: 29 tokens
  • mean: 329.02 tokens
  • max: 512 tokens
|
  • min: 0.0
  • mean: 0.31
  • max: 1.0
| +* Samples: + | sentence1 | sentence2 | score | + |:----------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:-----------------| + | query: מה התשלום עבור דרגה X במעון או משפחתון?
מה הזכאות עבור ילד נוסף במסגרת מעון או משפחתון?
| passage: דרגות השתתפות במימון מעונות יום ומשפחתונים

גובה שכר הלימוד
סכומי השתתפות המדינה והסכומים אותם משלימים ההורים עבור כל אחת מהדרגות, מפורטים בטבלאות שכר לימוד.
הטבלאות מחולקות לפי תעריפים לילדים ותעריפים לתינוקות:

גובה שכר הלימוד בהתאם לדרגות השונות מפורטת בטבלאות שכר לימוד במסגרות ילדים באתר משרד העבודה.
הדרגות המופיעות בטבלאות מותאמות לגובה ההכנסה לנפש במשפחה.
ככל שדרגת הזכאות (בטווח 12-3) נמוכה יותר, כך גדלה השתתפות המדינה בתשלום.
עבור רמות ההשתתפות המותאמות לגובה ההכנסות, יש להוסיף או להפחית דרגות, על-פי מספר מספר הילדים השוהים במעון או במשפחתון - למשפחה עם כמה ילדים השוהים במעון או במשפחתון, תוגדל רמת השתתפות המדינה עבור כל אחד מהילדים (תיקבע דרגה נמוכה יותר בין הדרגות 12-3).
תינוקות הם מי שגילם היה עד 15 חודשים ב-1 בספטמבר של שנת הלימודים אליה נרשמו.
ילדים הם מי שב-1 בספטמבר של שנת הלימודים אליה נרשמו, גילם היה בין 15 חודשים ויום ל-33 חודשים (או עד לגיל 46 חודשים, למי שיש להם אישור מהיחידה להתפתחות הילד כי הם מעוכבי התפתחות).

השפעת מספר הילדים השוהים במעון/משפחתון על דר...
| 0.8 | + | query: מה התשלום עבור דרגה X במעון או משפחתון?
מה הזכאות עבור ילד נוסף במסגרת מעון או משפחתון?
| passage: סיוע להורים עצמאיים (הורים יחידים) במימון מעונות יום ומשפחתונים

הורים עצמאיים (הורים יחידים) (/he/הורה_עצמאי_(הורה_יחיד)) עשויים להיות זכאים לסיוע במימון מעונות יום ומשפחתונים לפי דרגות השתתפות (/he/דרגות_השתתפות_במימון_מעונות_יום_ומשפחתונים) שנקבעות בהתאם לרמת ההכנסה שלהם (/he/רמת_ההכנסה_לנפש_לצורך_סיוע_במימון_מעונות_יום)



הורים עצמאיים שעונים על הגדרת מגבירי עבודה (.D7.96.D7.9B.D7.90.D7.95.D7.AA_.D7.A9.D7.9C_.D7.94.D7.95.D7.A8.D7.99.D7.9D_.D7.9E.D7.92.D7.91.D7.99.D7.A8.D7.99_.D7.A2.D7.91.D7.95.D7.93.D7.94) זכאים לסיוע מוגדל וישלמו 250 בחודש לילד אחד ו-375 בחודש לשני ילדים
כדי להקל על שילובם של הורים בשוק העבודה, המדינה מסייעת להם במימון מעונות יום ומשפחתונים מוכרים.

גובה התמיכה בזכאות הכללית של ההורים (זוגות הורים או הורים עצמאיים) נקבע על-פי מספר דרגות השתתפות המתבססות על רמת ההכנסה לנפש במשפחה.
הורים עצמאיים (הורים יחידים) שעונים על הגדרת "מגבירי עבודה" (בהתאם למפורט בהמשך), זכאים לסיוע מוגדל וישלמו רק 250 ₪ בחודש עבור ילד אחד ו-375 ₪ בחודש עבור שני ילדים.
לפרטים ומי...
| 0.7 | + | query: מה התשלום עבור דרגה X במעון או משפחתון?
מה הזכאות עבור ילד נוסף במסגרת מעון או משפחתון?
| passage: השתתפות במימון מעונות יום ומשפחתונים
תהליך מימוש הזכות
איתור מסגרת מוכרת
מערכת אינטרנטית (מקוונת) מאפשרת לאתר משפחתונים, מעונות וצהרונים הנמצאים בפיקוח ממשלתי.
ניתן לבצע חיפוש לפי ישוב, סוג המסגרת או שם המסגרת. לכניסה למערכת לחצו כאן.

התשלום למעון/משפחתון
עבור חודש אוגוסט יחושב תשלום באופן יחסי בהתאם לתקופת שהות הילד במעון (העלות המלאה לחודש כפול מספר החודשים שהילד שהה במעון חלקי 12 חודשים).
החזר כספי בהתאם לדרגת ההשתתפות, יינתן להורים עבור החודש שבו התקבל שאלון ההרשמה במוקד, ועבור החודש שקדם לו.
השתתפות בחודש הראשון לכניסת הילד למסגרת (מעון או משפחתון):

הוריו של ילד שנכנס למסגרת עד ל-15 בחודש (כולל ה-15 לחודש), יהיו זכאים להשתתפות המדינה עבור אותו חודש (על-פי הדרגה שנקבעה להם).
הוריו של ילד שנכנס למסגרת לאחר ה-15 לחודש, לא יהיו זכאים להשתתפות בחודש זה. במקרה זה יחויבו ההורים לשלם דמי החזקה באופן יחסי לימים בהם שהה הילד במסגרת.

היעדרות של הילד
היעדרות מהמעון לתקופה של יותר מ-21 ימים תיחשב כעזיבה, אלא אם כן הוצגו במהלך תקופה זו אישורים רפואיים על מחלה או אשפוז של הילד.
היעד...
| 0.8 | +* Loss: [CoSENTLoss](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#cosentloss) with these parameters: + ```json + { + "scale": 20.0, + "similarity_fct": "pairwise_cos_sim" + } + ``` + +### Training Hyperparameters +#### Non-Default Hyperparameters + +- `eval_strategy`: steps +- `per_device_train_batch_size`: 16 +- `per_device_eval_batch_size`: 16 +- `learning_rate`: 1e-05 +- `weight_decay`: 0.02 +- `num_train_epochs`: 2 +- `max_steps`: 400 +- `warmup_steps`: 400 +- `seed`: 321 +- `bf16`: True +- `tf32`: True +- `torch_compile`: True +- `torch_compile_backend`: inductor +- `batch_sampler`: no_duplicates + +#### All Hyperparameters +
Click to expand + +- `overwrite_output_dir`: False +- `do_predict`: False +- `eval_strategy`: steps +- `prediction_loss_only`: True +- `per_device_train_batch_size`: 16 +- `per_device_eval_batch_size`: 16 +- `per_gpu_train_batch_size`: None +- `per_gpu_eval_batch_size`: None +- `gradient_accumulation_steps`: 1 +- `eval_accumulation_steps`: None +- `torch_empty_cache_steps`: None +- `learning_rate`: 1e-05 +- `weight_decay`: 0.02 +- `adam_beta1`: 0.9 +- `adam_beta2`: 0.999 +- `adam_epsilon`: 1e-08 +- `max_grad_norm`: 1.0 +- `num_train_epochs`: 2 +- `max_steps`: 400 +- `lr_scheduler_type`: linear +- `lr_scheduler_kwargs`: {} +- `warmup_ratio`: 0.0 +- `warmup_steps`: 400 +- `log_level`: passive +- `log_level_replica`: warning +- `log_on_each_node`: True +- `logging_nan_inf_filter`: True +- `save_safetensors`: True +- `save_on_each_node`: False +- `save_only_model`: False +- `restore_callback_states_from_checkpoint`: False +- `no_cuda`: False +- `use_cpu`: False +- `use_mps_device`: False +- `seed`: 321 +- `data_seed`: None +- `jit_mode_eval`: False +- `use_ipex`: False +- `bf16`: True +- `fp16`: False +- `fp16_opt_level`: O1 +- `half_precision_backend`: auto +- `bf16_full_eval`: False +- `fp16_full_eval`: False +- `tf32`: True +- `local_rank`: 0 +- `ddp_backend`: None +- `tpu_num_cores`: None +- `tpu_metrics_debug`: False +- `debug`: [] +- `dataloader_drop_last`: False +- `dataloader_num_workers`: 0 +- `dataloader_prefetch_factor`: None +- `past_index`: -1 +- `disable_tqdm`: False +- `remove_unused_columns`: True +- `label_names`: None +- `load_best_model_at_end`: False +- `ignore_data_skip`: False +- `fsdp`: [] +- `fsdp_min_num_params`: 0 +- `fsdp_config`: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False} +- `fsdp_transformer_layer_cls_to_wrap`: None +- `accelerator_config`: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': 
None} +- `deepspeed`: None +- `label_smoothing_factor`: 0.0 +- `optim`: adamw_torch +- `optim_args`: None +- `adafactor`: False +- `group_by_length`: False +- `length_column_name`: length +- `ddp_find_unused_parameters`: None +- `ddp_bucket_cap_mb`: None +- `ddp_broadcast_buffers`: False +- `dataloader_pin_memory`: True +- `dataloader_persistent_workers`: False +- `skip_memory_metrics`: True +- `use_legacy_prediction_loop`: False +- `push_to_hub`: False +- `resume_from_checkpoint`: None +- `hub_model_id`: None +- `hub_strategy`: every_save +- `hub_private_repo`: None +- `hub_always_push`: False +- `hub_revision`: None +- `gradient_checkpointing`: False +- `gradient_checkpointing_kwargs`: None +- `include_inputs_for_metrics`: False +- `include_for_metrics`: [] +- `eval_do_concat_batches`: True +- `fp16_backend`: auto +- `push_to_hub_model_id`: None +- `push_to_hub_organization`: None +- `mp_parameters`: +- `auto_find_batch_size`: False +- `full_determinism`: False +- `torchdynamo`: None +- `ray_scope`: last +- `ddp_timeout`: 1800 +- `torch_compile`: True +- `torch_compile_backend`: inductor +- `torch_compile_mode`: None +- `include_tokens_per_second`: False +- `include_num_input_tokens_seen`: False +- `neftune_noise_alpha`: None +- `optim_target_modules`: None +- `batch_eval_metrics`: False +- `eval_on_start`: False +- `use_liger_kernel`: False +- `liger_kernel_config`: None +- `eval_use_gather_object`: False +- `average_tokens_across_devices`: False +- `prompts`: None +- `batch_sampler`: no_duplicates +- `multi_dataset_batch_sampler`: proportional +- `router_mapping`: {} +- `learning_rate_mapping`: {} + +
+ +### Training Logs +| Epoch | Step | Training Loss | Validation Loss | sts-dev_spearman_cosine | +|:------:|:----:|:-------------:|:---------------:|:-----------------------:| +| -1 | -1 | - | - | 0.4123 | +| 0.0153 | 400 | 4.2505 | 3.9722 | 0.4259 | +| -1 | -1 | - | - | 0.4259 | + + +### Framework Versions +- Python: 3.10.18 +- Sentence Transformers: 5.1.1 +- Transformers: 4.53.2 +- PyTorch: 2.9.0+cu128 +- Accelerate: 1.10.1 +- Datasets: 4.2.0 +- Tokenizers: 0.21.4 + +## Citation + +### BibTeX + +#### Sentence Transformers +```bibtex +@inproceedings{reimers-2019-sentence-bert, + title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks", + author = "Reimers, Nils and Gurevych, Iryna", + booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing", + month = "11", + year = "2019", + publisher = "Association for Computational Linguistics", + url = "https://arxiv.org/abs/1908.10084", +} +``` + +#### CoSENTLoss +```bibtex +@article{10531646, + author={Huang, Xiang and Peng, Hao and Zou, Dongcheng and Liu, Zhiwei and Li, Jianxin and Liu, Kay and Wu, Jia and Su, Jianlin and Yu, Philip S.}, + journal={IEEE/ACM Transactions on Audio, Speech, and Language Processing}, + title={CoSENT: Consistent Sentence Embedding via Similarity Ranking}, + year={2024}, + doi={10.1109/TASLP.2024.3402087} +} +``` + + + + + + \ No newline at end of file diff --git a/victord/sub19/models/multilingual-e5-large_pseudo_full/config.json b/victord/sub19/models/multilingual-e5-large_pseudo_full/config.json new file mode 100644 index 0000000000000000000000000000000000000000..65a56f5fb57bf0d23d8ba068cc6bb355f2f41c11 --- /dev/null +++ b/victord/sub19/models/multilingual-e5-large_pseudo_full/config.json @@ -0,0 +1,27 @@ +{ + "architectures": [ + "XLMRobertaModel" + ], + "attention_probs_dropout_prob": 0.0, + "bos_token_id": 0, + "classifier_dropout": null, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.0, + "hidden_size": 
1024, + "initializer_range": 0.02, + "intermediate_size": 4096, + "layer_norm_eps": 1e-05, + "max_position_embeddings": 514, + "model_type": "xlm-roberta", + "num_attention_heads": 16, + "num_hidden_layers": 24, + "output_past": true, + "pad_token_id": 1, + "position_embedding_type": "absolute", + "torch_dtype": "bfloat16", + "transformers_version": "4.53.2", + "type_vocab_size": 1, + "use_cache": true, + "vocab_size": 250002 +} diff --git a/victord/sub19/models/multilingual-e5-large_pseudo_full/config_sentence_transformers.json b/victord/sub19/models/multilingual-e5-large_pseudo_full/config_sentence_transformers.json new file mode 100644 index 0000000000000000000000000000000000000000..f30018604aa26f0260abd63dc4f3d9878f8da6f5 --- /dev/null +++ b/victord/sub19/models/multilingual-e5-large_pseudo_full/config_sentence_transformers.json @@ -0,0 +1,14 @@ +{ + "model_type": "SentenceTransformer", + "__version__": { + "sentence_transformers": "5.1.1", + "transformers": "4.53.2", + "pytorch": "2.9.0+cu128" + }, + "prompts": { + "query": "", + "document": "" + }, + "default_prompt_name": null, + "similarity_fn_name": "cosine" +} \ No newline at end of file diff --git a/victord/sub19/models/multilingual-e5-large_pseudo_full/model.safetensors b/victord/sub19/models/multilingual-e5-large_pseudo_full/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..55e2de11106010e1ecff53f622f1c1a8bdabc0b5 --- /dev/null +++ b/victord/sub19/models/multilingual-e5-large_pseudo_full/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49f6be8b5989601ed3659c9c2be131d0a077ce0e4e1ac08984768fe310cf8c00 +size 1119826072 diff --git a/victord/sub19/models/multilingual-e5-large_pseudo_full/modules.json b/victord/sub19/models/multilingual-e5-large_pseudo_full/modules.json new file mode 100644 index 0000000000000000000000000000000000000000..952a9b81c0bfd99800fabf352f69c7ccd46c5e43 --- /dev/null +++ 
b/victord/sub19/models/multilingual-e5-large_pseudo_full/modules.json @@ -0,0 +1,20 @@ +[ + { + "idx": 0, + "name": "0", + "path": "", + "type": "sentence_transformers.models.Transformer" + }, + { + "idx": 1, + "name": "1", + "path": "1_Pooling", + "type": "sentence_transformers.models.Pooling" + }, + { + "idx": 2, + "name": "2", + "path": "2_Normalize", + "type": "sentence_transformers.models.Normalize" + } +] \ No newline at end of file diff --git a/victord/sub19/models/multilingual-e5-large_pseudo_full/sentence_bert_config.json b/victord/sub19/models/multilingual-e5-large_pseudo_full/sentence_bert_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4eca68d85ecd3034cf4174d8a4033a75344ea62d --- /dev/null +++ b/victord/sub19/models/multilingual-e5-large_pseudo_full/sentence_bert_config.json @@ -0,0 +1,4 @@ +{ + "max_seq_length": 512, + "do_lower_case": false +} \ No newline at end of file diff --git a/victord/sub19/models/multilingual-e5-large_pseudo_full/sentencepiece.bpe.model b/victord/sub19/models/multilingual-e5-large_pseudo_full/sentencepiece.bpe.model new file mode 100644 index 0000000000000000000000000000000000000000..7a3f40a75f870bc1f21700cd414dc2acc431583c --- /dev/null +++ b/victord/sub19/models/multilingual-e5-large_pseudo_full/sentencepiece.bpe.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865 +size 5069051 diff --git a/victord/sub19/models/multilingual-e5-large_pseudo_full/special_tokens_map.json b/victord/sub19/models/multilingual-e5-large_pseudo_full/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b1879d702821e753ffe4245048eee415d54a9385 --- /dev/null +++ b/victord/sub19/models/multilingual-e5-large_pseudo_full/special_tokens_map.json @@ -0,0 +1,51 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + 
"cls_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "mask_token": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "sep_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/victord/sub19/models/multilingual-e5-large_pseudo_full/tokenizer.json b/victord/sub19/models/multilingual-e5-large_pseudo_full/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..2a51933f1ccb3cf68d53b877cbfa24734ada642f --- /dev/null +++ b/victord/sub19/models/multilingual-e5-large_pseudo_full/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:883b037111086fd4dfebbbc9b7cee11e1517b5e0c0514879478661440f137085 +size 17082987 diff --git a/victord/sub19/models/multilingual-e5-large_pseudo_full/tokenizer_config.json b/victord/sub19/models/multilingual-e5-large_pseudo_full/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8e54681d5d775f36555ac273ffbe63da3330edfd --- /dev/null +++ b/victord/sub19/models/multilingual-e5-large_pseudo_full/tokenizer_config.json @@ -0,0 +1,56 @@ +{ + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250001": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": true, + "cls_token": "", + "eos_token": "", + "extra_special_tokens": {}, + "mask_token": "", + "max_length": 512, + "model_max_length": 512, + "pad_token": "", + "sep_token": "", + "tokenizer_class": "XLMRobertaTokenizer", + "unk_token": "" +} diff --git a/victord/sub19/models/snowflake-arctic-embed-l-v2.0/1_Pooling/config.json b/victord/sub19/models/snowflake-arctic-embed-l-v2.0/1_Pooling/config.json new file mode 100644 index 0000000000000000000000000000000000000000..0dfd14b551978a38ff975782a03ffb4cadedc0c7 --- /dev/null +++ b/victord/sub19/models/snowflake-arctic-embed-l-v2.0/1_Pooling/config.json @@ -0,0 +1,10 @@ +{ + "word_embedding_dimension": 1024, + "pooling_mode_cls_token": true, + "pooling_mode_mean_tokens": false, + "pooling_mode_max_tokens": false, + "pooling_mode_mean_sqrt_len_tokens": false, + "pooling_mode_weightedmean_tokens": false, + "pooling_mode_lasttoken": false, + "include_prompt": true +} \ No newline at end of file diff --git a/victord/sub19/models/snowflake-arctic-embed-l-v2.0/README.md b/victord/sub19/models/snowflake-arctic-embed-l-v2.0/README.md new file mode 100644 index 0000000000000000000000000000000000000000..efce2db313fb07148b8299fe289a8aacd66f05bd --- /dev/null +++ b/victord/sub19/models/snowflake-arctic-embed-l-v2.0/README.md @@ -0,0 +1,9235 @@ +--- +pipeline_tag: sentence-similarity +tags: +- sentence-transformers +- feature-extraction +- sentence-similarity +- mteb +- arctic +- snowflake-arctic-embed +- transformers.js +license: apache-2.0 +language: +- af +- ar +- az +- be +- bg +- bn +- ca +- ceb +- cs +- cy +- da +- de +- el +- en 
+- es +- et +- eu +- fa +- fi +- fr +- gl +- gu +- he +- hi +- hr +- ht +- hu +- hy +- id +- is +- it +- ja +- jv +- ka +- kk +- km +- kn +- ko +- ky +- lo +- lt +- lv +- mk +- ml +- mn +- mr +- ms +- my +- ne +- nl +- pa +- pl +- pt +- qu +- ro +- ru +- si +- sk +- sl +- so +- sq +- sr +- sv +- sw +- ta +- te +- th +- tl +- tr +- uk +- ur +- vi +- yo +- zh +model-index: +- name: snowflake-arctic-embed-l-v2.0 + results: + - dataset: + config: en-ext + name: MTEB AmazonCounterfactualClassification (en-ext) + revision: e8379541af4e31359cca9fbcf4b00f2671dba205 + split: test + type: mteb/amazon_counterfactual + metrics: + - type: accuracy + value: 67.039 + - type: f1 + value: 55.1806 + - type: f1_weighted + value: 73.41149999999999 + - type: ap + value: 17.9914 + - type: ap_weighted + value: 17.9914 + - type: main_score + value: 67.039 + task: + type: Classification + - dataset: + config: en + name: MTEB AmazonCounterfactualClassification (en) + revision: e8379541af4e31359cca9fbcf4b00f2671dba205 + split: test + type: mteb/amazon_counterfactual + metrics: + - type: accuracy + value: 65.59700000000001 + - type: f1 + value: 60.244299999999996 + - type: f1_weighted + value: 68.9975 + - type: ap + value: 29.762100000000004 + - type: ap_weighted + value: 29.762100000000004 + - type: main_score + value: 65.59700000000001 + task: + type: Classification + - dataset: + config: default + name: MTEB AmazonPolarityClassification (default) + revision: e2d317d38cd51312af73b3d32a06d1a08b442046 + split: test + type: mteb/amazon_polarity + metrics: + - type: accuracy + value: 74.2565 + - type: f1 + value: 74.0291 + - type: f1_weighted + value: 74.0291 + - type: ap + value: 68.7595 + - type: ap_weighted + value: 68.7595 + - type: main_score + value: 74.2565 + task: + type: Classification + - dataset: + config: en + name: MTEB AmazonReviewsClassification (en) + revision: 1399c76144fd37290681b995c656ef9b2e06e26d + split: test + type: mteb/amazon_reviews_multi + metrics: + - type: accuracy 
+ value: 34.946 + - type: f1 + value: 34.2853 + - type: f1_weighted + value: 34.2853 + - type: main_score + value: 34.946 + task: + type: Classification + - dataset: + config: default + name: MTEB ArguAna (default) + revision: c22ab2a51041ffd869aaddef7af8d8215647e41a + split: test + type: mteb/arguana + metrics: + - type: ndcg_at_1 + value: 33.286 + - type: ndcg_at_3 + value: 49.051 + - type: ndcg_at_5 + value: 54.107000000000006 + - type: ndcg_at_10 + value: 59.146 + - type: ndcg_at_20 + value: 60.897999999999996 + - type: ndcg_at_100 + value: 61.78399999999999 + - type: ndcg_at_1000 + value: 61.845000000000006 + - type: map_at_1 + value: 33.286 + - type: map_at_3 + value: 45.14 + - type: map_at_5 + value: 47.939 + - type: map_at_10 + value: 50.046 + - type: map_at_20 + value: 50.56 + - type: map_at_100 + value: 50.708 + - type: map_at_1000 + value: 50.712 + - type: recall_at_1 + value: 33.286 + - type: recall_at_3 + value: 60.38400000000001 + - type: recall_at_5 + value: 72.688 + - type: recall_at_10 + value: 88.122 + - type: recall_at_20 + value: 94.808 + - type: recall_at_100 + value: 99.21799999999999 + - type: recall_at_1000 + value: 99.644 + - type: precision_at_1 + value: 33.286 + - type: precision_at_3 + value: 20.128 + - type: precision_at_5 + value: 14.538 + - type: precision_at_10 + value: 8.812000000000001 + - type: precision_at_20 + value: 4.74 + - type: precision_at_100 + value: 0.992 + - type: precision_at_1000 + value: 0.1 + - type: mrr_at_1 + value: 33.926 + - type: mrr_at_3 + value: 45.3414 + - type: mrr_at_5 + value: 48.1828 + - type: mrr_at_10 + value: 50.270700000000005 + - type: mrr_at_20 + value: 50.7844 + - type: mrr_at_100 + value: 50.9259 + - type: mrr_at_1000 + value: 50.9294 + - type: nauc_ndcg_at_1_max + value: -10.305 + - type: nauc_ndcg_at_1_std + value: -15.674199999999999 + - type: nauc_ndcg_at_1_diff1 + value: 18.6355 + - type: nauc_ndcg_at_3_max + value: -7.744 + - type: nauc_ndcg_at_3_std + value: -16.894000000000002 + - type: 
nauc_ndcg_at_3_diff1 + value: 15.4469 + - type: nauc_ndcg_at_5_max + value: -6.4887 + - type: nauc_ndcg_at_5_std + value: -16.1382 + - type: nauc_ndcg_at_5_diff1 + value: 13.8214 + - type: nauc_ndcg_at_10_max + value: -7.616499999999999 + - type: nauc_ndcg_at_10_std + value: -15.8073 + - type: nauc_ndcg_at_10_diff1 + value: 13.7678 + - type: nauc_ndcg_at_20_max + value: -6.9801 + - type: nauc_ndcg_at_20_std + value: -15.068699999999998 + - type: nauc_ndcg_at_20_diff1 + value: 14.2013 + - type: nauc_ndcg_at_100_max + value: -7.5221 + - type: nauc_ndcg_at_100_std + value: -15.417200000000001 + - type: nauc_ndcg_at_100_diff1 + value: 15.1072 + - type: nauc_ndcg_at_1000_max + value: -7.6931 + - type: nauc_ndcg_at_1000_std + value: -15.5367 + - type: nauc_ndcg_at_1000_diff1 + value: 15.001700000000001 + - type: nauc_map_at_1_max + value: -10.305 + - type: nauc_map_at_1_std + value: -15.674199999999999 + - type: nauc_map_at_1_diff1 + value: 18.6355 + - type: nauc_map_at_3_max + value: -8.4505 + - type: nauc_map_at_3_std + value: -16.5487 + - type: nauc_map_at_3_diff1 + value: 15.965599999999998 + - type: nauc_map_at_5_max + value: -7.8429 + - type: nauc_map_at_5_std + value: -16.1332 + - type: nauc_map_at_5_diff1 + value: 15.0893 + - type: nauc_map_at_10_max + value: -8.3186 + - type: nauc_map_at_10_std + value: -15.979399999999998 + - type: nauc_map_at_10_diff1 + value: 15.136199999999999 + - type: nauc_map_at_20_max + value: -8.1697 + - type: nauc_map_at_20_std + value: -15.8241 + - type: nauc_map_at_20_diff1 + value: 15.260599999999998 + - type: nauc_map_at_100_max + value: -8.2285 + - type: nauc_map_at_100_std + value: -15.8624 + - type: nauc_map_at_100_diff1 + value: 15.412600000000001 + - type: nauc_map_at_1000_max + value: -8.2359 + - type: nauc_map_at_1000_std + value: -15.867 + - type: nauc_map_at_1000_diff1 + value: 15.408 + - type: nauc_recall_at_1_max + value: -10.305 + - type: nauc_recall_at_1_std + value: -15.674199999999999 + - type: nauc_recall_at_1_diff1 
+ value: 18.6355 + - type: nauc_recall_at_3_max + value: -5.5097 + - type: nauc_recall_at_3_std + value: -17.9896 + - type: nauc_recall_at_3_diff1 + value: 13.9525 + - type: nauc_recall_at_5_max + value: -0.9383 + - type: nauc_recall_at_5_std + value: -16.035 + - type: nauc_recall_at_5_diff1 + value: 8.8431 + - type: nauc_recall_at_10_max + value: -2.8548 + - type: nauc_recall_at_10_std + value: -14.1203 + - type: nauc_recall_at_10_diff1 + value: 3.2265 + - type: nauc_recall_at_20_max + value: 14.2043 + - type: nauc_recall_at_20_std + value: 2.1298999999999997 + - type: nauc_recall_at_20_diff1 + value: -1.9900000000000002 + - type: nauc_recall_at_100_max + value: 44.0173 + - type: nauc_recall_at_100_std + value: 42.131800000000005 + - type: nauc_recall_at_100_diff1 + value: 29.9983 + - type: nauc_recall_at_1000_max + value: 25.9434 + - type: nauc_recall_at_1000_std + value: 53.9252 + - type: nauc_recall_at_1000_diff1 + value: -0.9778 + - type: nauc_precision_at_1_max + value: -10.305 + - type: nauc_precision_at_1_std + value: -15.674199999999999 + - type: nauc_precision_at_1_diff1 + value: 18.6355 + - type: nauc_precision_at_3_max + value: -5.5097 + - type: nauc_precision_at_3_std + value: -17.9896 + - type: nauc_precision_at_3_diff1 + value: 13.9525 + - type: nauc_precision_at_5_max + value: -0.9383 + - type: nauc_precision_at_5_std + value: -16.035 + - type: nauc_precision_at_5_diff1 + value: 8.8431 + - type: nauc_precision_at_10_max + value: -2.8548 + - type: nauc_precision_at_10_std + value: -14.1203 + - type: nauc_precision_at_10_diff1 + value: 3.2265 + - type: nauc_precision_at_20_max + value: 14.2043 + - type: nauc_precision_at_20_std + value: 2.1298999999999997 + - type: nauc_precision_at_20_diff1 + value: -1.9900000000000002 + - type: nauc_precision_at_100_max + value: 44.0173 + - type: nauc_precision_at_100_std + value: 42.131800000000005 + - type: nauc_precision_at_100_diff1 + value: 29.9983 + - type: nauc_precision_at_1000_max + value: 25.9434 + - type: 
nauc_precision_at_1000_std + value: 53.9252 + - type: nauc_precision_at_1000_diff1 + value: -0.9778 + - type: nauc_mrr_at_1_max + value: -9.833 + - type: nauc_mrr_at_1_std + value: -14.8351 + - type: nauc_mrr_at_1_diff1 + value: 16.7604 + - type: nauc_mrr_at_3_max + value: -9.0116 + - type: nauc_mrr_at_3_std + value: -16.296 + - type: nauc_mrr_at_3_diff1 + value: 14.178199999999999 + - type: nauc_mrr_at_5_max + value: -8.308300000000001 + - type: nauc_mrr_at_5_std + value: -15.751999999999999 + - type: nauc_mrr_at_5_diff1 + value: 13.306299999999998 + - type: nauc_mrr_at_10_max + value: -8.7962 + - type: nauc_mrr_at_10_std + value: -15.688099999999999 + - type: nauc_mrr_at_10_diff1 + value: 13.2589 + - type: nauc_mrr_at_20_max + value: -8.6773 + - type: nauc_mrr_at_20_std + value: -15.479499999999998 + - type: nauc_mrr_at_20_diff1 + value: 13.354 + - type: nauc_mrr_at_100_max + value: -8.7533 + - type: nauc_mrr_at_100_std + value: -15.553600000000001 + - type: nauc_mrr_at_100_diff1 + value: 13.4796 + - type: nauc_mrr_at_1000_max + value: -8.7608 + - type: nauc_mrr_at_1000_std + value: -15.5582 + - type: nauc_mrr_at_1000_diff1 + value: 13.4748 + - type: main_score + value: 59.146 + task: + type: Retrieval + - dataset: + config: default + name: MTEB ArxivClusteringP2P (default) + revision: a122ad7f3f0291bf49cc6f4d32aa80929df69d5d + split: test + type: mteb/arxiv-clustering-p2p + metrics: + - type: v_measure + value: 43.9715 + - type: v_measure_std + value: 13.4325 + - type: main_score + value: 43.9715 + task: + type: Clustering + - dataset: + config: default + name: MTEB ArxivClusteringS2S (default) + revision: f910caf1a6075f7329cdf8c1a6135696f37dbd53 + split: test + type: mteb/arxiv-clustering-s2s + metrics: + - type: v_measure + value: 34.775800000000004 + - type: v_measure_std + value: 13.922799999999999 + - type: main_score + value: 34.775800000000004 + task: + type: Clustering + - dataset: + config: default + name: MTEB AskUbuntuDupQuestions (default) + 
revision: 2000358ca161889fa9c082cb41daa8dcfb161a54 + split: test + type: mteb/askubuntudupquestions-reranking + metrics: + - type: map + value: 63.3521 + - type: mrr + value: 77.5965 + - type: nAUC_map_max + value: 21.2353 + - type: nAUC_map_std + value: 17.002100000000002 + - type: nAUC_map_diff1 + value: 3.8135000000000003 + - type: nAUC_mrr_max + value: 35.058299999999996 + - type: nAUC_mrr_std + value: 20.432 + - type: nAUC_mrr_diff1 + value: 9.2584 + - type: main_score + value: 63.3521 + task: + type: Reranking + - dataset: + config: default + name: MTEB BIOSSES (default) + revision: d3fb88f8f02e40887cd149695127462bbcf29b4a + split: test + type: mteb/biosses-sts + metrics: + - type: pearson + value: 89.8072 + - type: spearman + value: 87.2875 + - type: cosine_pearson + value: 89.8072 + - type: cosine_spearman + value: 87.2875 + - type: manhattan_pearson + value: 87.9173 + - type: manhattan_spearman + value: 86.7327 + - type: euclidean_pearson + value: 88.21600000000001 + - type: euclidean_spearman + value: 87.2875 + - type: main_score + value: 87.2875 + task: + type: STS + - dataset: + config: default + name: MTEB Banking77Classification (default) + revision: 0fd18e25b25c072e09e0d92ab615fda904d66300 + split: test + type: mteb/banking77 + metrics: + - type: accuracy + value: 81.8149 + - type: f1 + value: 81.2226 + - type: f1_weighted + value: 81.2226 + - type: main_score + value: 81.8149 + task: + type: Classification + - dataset: + config: default + name: MTEB BiorxivClusteringP2P (default) + revision: 65b79d1d13f80053f67aca9498d9402c2d9f1f40 + split: test + type: mteb/biorxiv-clustering-p2p + metrics: + - type: v_measure + value: 35.0927 + - type: v_measure_std + value: 0.7048 + - type: main_score + value: 35.0927 + task: + type: Clustering + - dataset: + config: default + name: MTEB BiorxivClusteringS2S (default) + revision: 258694dd0231531bc1fd9de6ceb52a0853c6d908 + split: test + type: mteb/biorxiv-clustering-s2s + metrics: + - type: v_measure + value: 
30.220999999999997 + - type: v_measure_std + value: 1.107 + - type: main_score + value: 30.220999999999997 + task: + type: Clustering + - dataset: + config: default + name: MTEB CQADupstackAndroidRetrieval (default) + revision: f46a197baaae43b4f621051089b82a364682dfeb + split: test + type: mteb/cqadupstack-android + metrics: + - type: ndcg_at_1 + value: 44.349 + - type: ndcg_at_3 + value: 50.109 + - type: ndcg_at_5 + value: 52.88699999999999 + - type: ndcg_at_10 + value: 55.799 + - type: ndcg_at_20 + value: 57.589999999999996 + - type: ndcg_at_100 + value: 60.539 + - type: ndcg_at_1000 + value: 61.897000000000006 + - type: map_at_1 + value: 36.230000000000004 + - type: map_at_3 + value: 44.929 + - type: map_at_5 + value: 47.191 + - type: map_at_10 + value: 48.88 + - type: map_at_20 + value: 49.685 + - type: map_at_100 + value: 50.327 + - type: map_at_1000 + value: 50.431000000000004 + - type: recall_at_1 + value: 36.230000000000004 + - type: recall_at_3 + value: 53.173 + - type: recall_at_5 + value: 60.35 + - type: recall_at_10 + value: 69.07 + - type: recall_at_20 + value: 75.371 + - type: recall_at_100 + value: 88.736 + - type: recall_at_1000 + value: 96.75399999999999 + - type: precision_at_1 + value: 44.349 + - type: precision_at_3 + value: 23.748 + - type: precision_at_5 + value: 17.368 + - type: precision_at_10 + value: 10.629 + - type: precision_at_20 + value: 6.152 + - type: precision_at_100 + value: 1.6150000000000002 + - type: precision_at_1000 + value: 0.201 + - type: mrr_at_1 + value: 44.3491 + - type: mrr_at_3 + value: 52.0744 + - type: mrr_at_5 + value: 53.9628 + - type: mrr_at_10 + value: 54.9072 + - type: mrr_at_20 + value: 55.19539999999999 + - type: mrr_at_100 + value: 55.4537 + - type: mrr_at_1000 + value: 55.4787 + - type: nauc_ndcg_at_1_max + value: 36.404599999999995 + - type: nauc_ndcg_at_1_std + value: -4.5556 + - type: nauc_ndcg_at_1_diff1 + value: 57.4025 + - type: nauc_ndcg_at_3_max + value: 38.0347 + - type: nauc_ndcg_at_3_std + value: 
-2.2339 + - type: nauc_ndcg_at_3_diff1 + value: 50.9146 + - type: nauc_ndcg_at_5_max + value: 38.2927 + - type: nauc_ndcg_at_5_std + value: -2.3645 + - type: nauc_ndcg_at_5_diff1 + value: 51.638 + - type: nauc_ndcg_at_10_max + value: 38.4619 + - type: nauc_ndcg_at_10_std + value: -2.8955 + - type: nauc_ndcg_at_10_diff1 + value: 51.35849999999999 + - type: nauc_ndcg_at_20_max + value: 38.2122 + - type: nauc_ndcg_at_20_std + value: -1.9339 + - type: nauc_ndcg_at_20_diff1 + value: 50.4981 + - type: nauc_ndcg_at_100_max + value: 39.380900000000004 + - type: nauc_ndcg_at_100_std + value: -0.21889999999999998 + - type: nauc_ndcg_at_100_diff1 + value: 51.5696 + - type: nauc_ndcg_at_1000_max + value: 38.9069 + - type: nauc_ndcg_at_1000_std + value: -0.8251 + - type: nauc_ndcg_at_1000_diff1 + value: 51.605500000000006 + - type: nauc_map_at_1_max + value: 31.694 + - type: nauc_map_at_1_std + value: -4.2857 + - type: nauc_map_at_1_diff1 + value: 57.991400000000006 + - type: nauc_map_at_3_max + value: 36.115399999999994 + - type: nauc_map_at_3_std + value: -3.9859999999999998 + - type: nauc_map_at_3_diff1 + value: 52.394 + - type: nauc_map_at_5_max + value: 36.896499999999996 + - type: nauc_map_at_5_std + value: -3.6282 + - type: nauc_map_at_5_diff1 + value: 52.7023 + - type: nauc_map_at_10_max + value: 37.2695 + - type: nauc_map_at_10_std + value: -3.7142 + - type: nauc_map_at_10_diff1 + value: 52.6081 + - type: nauc_map_at_20_max + value: 37.4097 + - type: nauc_map_at_20_std + value: -3.0479 + - type: nauc_map_at_20_diff1 + value: 52.2999 + - type: nauc_map_at_100_max + value: 37.6608 + - type: nauc_map_at_100_std + value: -2.7363999999999997 + - type: nauc_map_at_100_diff1 + value: 52.5068 + - type: nauc_map_at_1000_max + value: 37.6406 + - type: nauc_map_at_1000_std + value: -2.7695000000000003 + - type: nauc_map_at_1000_diff1 + value: 52.5091 + - type: nauc_recall_at_1_max + value: 31.694 + - type: nauc_recall_at_1_std + value: -4.2857 + - type: nauc_recall_at_1_diff1 + 
value: 57.991400000000006 + - type: nauc_recall_at_3_max + value: 35.9705 + - type: nauc_recall_at_3_std + value: -2.78 + - type: nauc_recall_at_3_diff1 + value: 44.2342 + - type: nauc_recall_at_5_max + value: 36.3608 + - type: nauc_recall_at_5_std + value: -1.8541999999999998 + - type: nauc_recall_at_5_diff1 + value: 45.0955 + - type: nauc_recall_at_10_max + value: 35.7364 + - type: nauc_recall_at_10_std + value: -3.2479 + - type: nauc_recall_at_10_diff1 + value: 42.3031 + - type: nauc_recall_at_20_max + value: 34.7814 + - type: nauc_recall_at_20_std + value: 0.7642 + - type: nauc_recall_at_20_diff1 + value: 37.3357 + - type: nauc_recall_at_100_max + value: 49.1721 + - type: nauc_recall_at_100_std + value: 27.8334 + - type: nauc_recall_at_100_diff1 + value: 39.549 + - type: nauc_recall_at_1000_max + value: 59.516400000000004 + - type: nauc_recall_at_1000_std + value: 66.1089 + - type: nauc_recall_at_1000_diff1 + value: 31.4818 + - type: nauc_precision_at_1_max + value: 36.404599999999995 + - type: nauc_precision_at_1_std + value: -4.5556 + - type: nauc_precision_at_1_diff1 + value: 57.4025 + - type: nauc_precision_at_3_max + value: 35.7954 + - type: nauc_precision_at_3_std + value: 0.6122 + - type: nauc_precision_at_3_diff1 + value: 29.4346 + - type: nauc_precision_at_5_max + value: 31.322699999999998 + - type: nauc_precision_at_5_std + value: 2.2124 + - type: nauc_precision_at_5_diff1 + value: 21.1992 + - type: nauc_precision_at_10_max + value: 22.6897 + - type: nauc_precision_at_10_std + value: 3.6117999999999997 + - type: nauc_precision_at_10_diff1 + value: 9.0833 + - type: nauc_precision_at_20_max + value: 14.954799999999999 + - type: nauc_precision_at_20_std + value: 7.2373 + - type: nauc_precision_at_20_diff1 + value: -0.544 + - type: nauc_precision_at_100_max + value: 4.2428 + - type: nauc_precision_at_100_std + value: 7.3461 + - type: nauc_precision_at_100_diff1 + value: -11.3684 + - type: nauc_precision_at_1000_max + value: -9.148399999999999 + - type: 
nauc_precision_at_1000_std + value: -3.5724 + - type: nauc_precision_at_1000_diff1 + value: -19.142400000000002 + - type: nauc_mrr_at_1_max + value: 36.404599999999995 + - type: nauc_mrr_at_1_std + value: -4.5556 + - type: nauc_mrr_at_1_diff1 + value: 57.4025 + - type: nauc_mrr_at_3_max + value: 38.7222 + - type: nauc_mrr_at_3_std + value: -2.3924000000000003 + - type: nauc_mrr_at_3_diff1 + value: 52.7995 + - type: nauc_mrr_at_5_max + value: 38.7579 + - type: nauc_mrr_at_5_std + value: -2.6441 + - type: nauc_mrr_at_5_diff1 + value: 53.547599999999996 + - type: nauc_mrr_at_10_max + value: 38.7832 + - type: nauc_mrr_at_10_std + value: -2.5202999999999998 + - type: nauc_mrr_at_10_diff1 + value: 53.4856 + - type: nauc_mrr_at_20_max + value: 38.6588 + - type: nauc_mrr_at_20_std + value: -2.501 + - type: nauc_mrr_at_20_diff1 + value: 53.3571 + - type: nauc_mrr_at_100_max + value: 38.6456 + - type: nauc_mrr_at_100_std + value: -2.4756 + - type: nauc_mrr_at_100_diff1 + value: 53.455600000000004 + - type: nauc_mrr_at_1000_max + value: 38.6449 + - type: nauc_mrr_at_1000_std + value: -2.4623 + - type: nauc_mrr_at_1000_diff1 + value: 53.45419999999999 + - type: main_score + value: 55.799 + task: + type: Retrieval + - dataset: + config: default + name: MTEB CQADupstackEnglishRetrieval (default) + revision: ad9991cb51e31e31e430383c75ffb2885547b5f0 + split: test + type: mteb/cqadupstack-english + metrics: + - type: ndcg_at_1 + value: 44.204 + - type: ndcg_at_3 + value: 49.549 + - type: ndcg_at_5 + value: 51.658 + - type: ndcg_at_10 + value: 53.681 + - type: ndcg_at_20 + value: 55.129 + - type: ndcg_at_100 + value: 57.691 + - type: ndcg_at_1000 + value: 59.325 + - type: map_at_1 + value: 35.193000000000005 + - type: map_at_3 + value: 44.005 + - type: map_at_5 + value: 46.043 + - type: map_at_10 + value: 47.491 + - type: map_at_20 + value: 48.169000000000004 + - type: map_at_100 + value: 48.789 + - type: map_at_1000 + value: 48.898 + - type: recall_at_1 + value: 35.193000000000005 
+ - type: recall_at_3 + value: 51.333 + - type: recall_at_5 + value: 57.436 + - type: recall_at_10 + value: 63.991 + - type: recall_at_20 + value: 69.37100000000001 + - type: recall_at_100 + value: 81.099 + - type: recall_at_1000 + value: 91.363 + - type: precision_at_1 + value: 44.204 + - type: precision_at_3 + value: 24.374000000000002 + - type: precision_at_5 + value: 17.287 + - type: precision_at_10 + value: 10.293 + - type: precision_at_20 + value: 5.943 + - type: precision_at_100 + value: 1.5730000000000002 + - type: precision_at_1000 + value: 0.197 + - type: mrr_at_1 + value: 44.2038 + - type: mrr_at_3 + value: 51.624199999999995 + - type: mrr_at_5 + value: 52.9459 + - type: mrr_at_10 + value: 53.697399999999995 + - type: mrr_at_20 + value: 54.028200000000005 + - type: mrr_at_100 + value: 54.267900000000004 + - type: mrr_at_1000 + value: 54.3028 + - type: nauc_ndcg_at_1_max + value: 45.3525 + - type: nauc_ndcg_at_1_std + value: -2.2124 + - type: nauc_ndcg_at_1_diff1 + value: 59.392100000000006 + - type: nauc_ndcg_at_3_max + value: 46.6258 + - type: nauc_ndcg_at_3_std + value: -2.8042000000000002 + - type: nauc_ndcg_at_3_diff1 + value: 55.0995 + - type: nauc_ndcg_at_5_max + value: 47.3391 + - type: nauc_ndcg_at_5_std + value: -1.8336999999999999 + - type: nauc_ndcg_at_5_diff1 + value: 54.848 + - type: nauc_ndcg_at_10_max + value: 47.713899999999995 + - type: nauc_ndcg_at_10_std + value: -0.6185 + - type: nauc_ndcg_at_10_diff1 + value: 54.6241 + - type: nauc_ndcg_at_20_max + value: 48.072900000000004 + - type: nauc_ndcg_at_20_std + value: -0.21589999999999998 + - type: nauc_ndcg_at_20_diff1 + value: 54.655100000000004 + - type: nauc_ndcg_at_100_max + value: 48.4791 + - type: nauc_ndcg_at_100_std + value: 1.9865000000000002 + - type: nauc_ndcg_at_100_diff1 + value: 54.033 + - type: nauc_ndcg_at_1000_max + value: 48.3686 + - type: nauc_ndcg_at_1000_std + value: 1.8716 + - type: nauc_ndcg_at_1000_diff1 + value: 54.125 + - type: nauc_map_at_1_max + value: 
34.797200000000004 + - type: nauc_map_at_1_std + value: -13.140199999999998 + - type: nauc_map_at_1_diff1 + value: 61.197100000000006 + - type: nauc_map_at_3_max + value: 41.4347 + - type: nauc_map_at_3_std + value: -10.0816 + - type: nauc_map_at_3_diff1 + value: 57.8979 + - type: nauc_map_at_5_max + value: 43.1536 + - type: nauc_map_at_5_std + value: -7.8041 + - type: nauc_map_at_5_diff1 + value: 57.1125 + - type: nauc_map_at_10_max + value: 44.243700000000004 + - type: nauc_map_at_10_std + value: -6.047000000000001 + - type: nauc_map_at_10_diff1 + value: 56.688700000000004 + - type: nauc_map_at_20_max + value: 44.7799 + - type: nauc_map_at_20_std + value: -5.2916 + - type: nauc_map_at_20_diff1 + value: 56.565799999999996 + - type: nauc_map_at_100_max + value: 45.3233 + - type: nauc_map_at_100_std + value: -4.287 + - type: nauc_map_at_100_diff1 + value: 56.41460000000001 + - type: nauc_map_at_1000_max + value: 45.3992 + - type: nauc_map_at_1000_std + value: -4.1593 + - type: nauc_map_at_1000_diff1 + value: 56.413599999999995 + - type: nauc_recall_at_1_max + value: 34.797200000000004 + - type: nauc_recall_at_1_std + value: -13.140199999999998 + - type: nauc_recall_at_1_diff1 + value: 61.197100000000006 + - type: nauc_recall_at_3_max + value: 42.7264 + - type: nauc_recall_at_3_std + value: -8.201799999999999 + - type: nauc_recall_at_3_diff1 + value: 52.3494 + - type: nauc_recall_at_5_max + value: 44.6494 + - type: nauc_recall_at_5_std + value: -3.3112999999999997 + - type: nauc_recall_at_5_diff1 + value: 50.1019 + - type: nauc_recall_at_10_max + value: 46.6669 + - type: nauc_recall_at_10_std + value: 2.3359 + - type: nauc_recall_at_10_diff1 + value: 48.1454 + - type: nauc_recall_at_20_max + value: 48.7828 + - type: nauc_recall_at_20_std + value: 6.0266 + - type: nauc_recall_at_20_diff1 + value: 46.786699999999996 + - type: nauc_recall_at_100_max + value: 53.081999999999994 + - type: nauc_recall_at_100_std + value: 24.1569 + - type: nauc_recall_at_100_diff1 + value: 
40.4049 + - type: nauc_recall_at_1000_max + value: 55.803000000000004 + - type: nauc_recall_at_1000_std + value: 36.3769 + - type: nauc_recall_at_1000_diff1 + value: 34.336 + - type: nauc_precision_at_1_max + value: 45.3525 + - type: nauc_precision_at_1_std + value: -2.2124 + - type: nauc_precision_at_1_diff1 + value: 59.392100000000006 + - type: nauc_precision_at_3_max + value: 44.2838 + - type: nauc_precision_at_3_std + value: 14.3908 + - type: nauc_precision_at_3_diff1 + value: 27.219700000000003 + - type: nauc_precision_at_5_max + value: 42.9914 + - type: nauc_precision_at_5_std + value: 23.0682 + - type: nauc_precision_at_5_diff1 + value: 16.2263 + - type: nauc_precision_at_10_max + value: 38.5042 + - type: nauc_precision_at_10_std + value: 30.792199999999998 + - type: nauc_precision_at_10_diff1 + value: 5.7691 + - type: nauc_precision_at_20_max + value: 34.417500000000004 + - type: nauc_precision_at_20_std + value: 34.1749 + - type: nauc_precision_at_20_diff1 + value: -0.9022 + - type: nauc_precision_at_100_max + value: 27.4072 + - type: nauc_precision_at_100_std + value: 42.4351 + - type: nauc_precision_at_100_diff1 + value: -11.407 + - type: nauc_precision_at_1000_max + value: 16.142400000000002 + - type: nauc_precision_at_1000_std + value: 36.4482 + - type: nauc_precision_at_1000_diff1 + value: -16.8073 + - type: nauc_mrr_at_1_max + value: 45.3525 + - type: nauc_mrr_at_1_std + value: -2.2124 + - type: nauc_mrr_at_1_diff1 + value: 59.392100000000006 + - type: nauc_mrr_at_3_max + value: 48.7407 + - type: nauc_mrr_at_3_std + value: 0.2074 + - type: nauc_mrr_at_3_diff1 + value: 55.8153 + - type: nauc_mrr_at_5_max + value: 48.9081 + - type: nauc_mrr_at_5_std + value: 0.9781 + - type: nauc_mrr_at_5_diff1 + value: 55.6807 + - type: nauc_mrr_at_10_max + value: 48.7888 + - type: nauc_mrr_at_10_std + value: 1.384 + - type: nauc_mrr_at_10_diff1 + value: 55.5207 + - type: nauc_mrr_at_20_max + value: 48.7371 + - type: nauc_mrr_at_20_std + value: 1.3671 + - type: 
nauc_mrr_at_20_diff1 + value: 55.508199999999995 + - type: nauc_mrr_at_100_max + value: 48.7472 + - type: nauc_mrr_at_100_std + value: 1.5221 + - type: nauc_mrr_at_100_diff1 + value: 55.5036 + - type: nauc_mrr_at_1000_max + value: 48.7402 + - type: nauc_mrr_at_1000_std + value: 1.5072 + - type: nauc_mrr_at_1000_diff1 + value: 55.507 + - type: main_score + value: 53.681 + task: + type: Retrieval + - dataset: + config: default + name: MTEB CQADupstackGamingRetrieval (default) + revision: 4885aa143210c98657558c04aaf3dc47cfb54340 + split: test + type: mteb/cqadupstack-gaming + metrics: + - type: ndcg_at_1 + value: 50.345 + - type: ndcg_at_3 + value: 57.776 + - type: ndcg_at_5 + value: 60.477000000000004 + - type: ndcg_at_10 + value: 63.172 + - type: ndcg_at_20 + value: 64.62 + - type: ndcg_at_100 + value: 66.538 + - type: ndcg_at_1000 + value: 67.43 + - type: map_at_1 + value: 44.153 + - type: map_at_3 + value: 53.979 + - type: map_at_5 + value: 55.925000000000004 + - type: map_at_10 + value: 57.32899999999999 + - type: map_at_20 + value: 57.879000000000005 + - type: map_at_100 + value: 58.239 + - type: map_at_1000 + value: 58.285 + - type: recall_at_1 + value: 44.153 + - type: recall_at_3 + value: 62.766999999999996 + - type: recall_at_5 + value: 69.405 + - type: recall_at_10 + value: 77.107 + - type: recall_at_20 + value: 82.337 + - type: recall_at_100 + value: 91.307 + - type: recall_at_1000 + value: 97.586 + - type: precision_at_1 + value: 50.345 + - type: precision_at_3 + value: 25.601000000000003 + - type: precision_at_5 + value: 17.416999999999998 + - type: precision_at_10 + value: 9.994 + - type: precision_at_20 + value: 5.492 + - type: precision_at_100 + value: 1.261 + - type: precision_at_1000 + value: 0.13799999999999998 + - type: mrr_at_1 + value: 50.3448 + - type: mrr_at_3 + value: 58.160900000000005 + - type: mrr_at_5 + value: 59.549600000000005 + - type: mrr_at_10 + value: 60.545899999999996 + - type: mrr_at_20 + value: 60.8453 + - type: mrr_at_100 + 
value: 61.06120000000001 + - type: mrr_at_1000 + value: 61.083299999999994 + - type: nauc_ndcg_at_1_max + value: 39.467400000000005 + - type: nauc_ndcg_at_1_std + value: -6.512 + - type: nauc_ndcg_at_1_diff1 + value: 57.337700000000005 + - type: nauc_ndcg_at_3_max + value: 42.8884 + - type: nauc_ndcg_at_3_std + value: -6.0156 + - type: nauc_ndcg_at_3_diff1 + value: 54.432 + - type: nauc_ndcg_at_5_max + value: 44.831500000000005 + - type: nauc_ndcg_at_5_std + value: -4.3286999999999995 + - type: nauc_ndcg_at_5_diff1 + value: 54.6971 + - type: nauc_ndcg_at_10_max + value: 44.391799999999996 + - type: nauc_ndcg_at_10_std + value: -3.6792 + - type: nauc_ndcg_at_10_diff1 + value: 53.749199999999995 + - type: nauc_ndcg_at_20_max + value: 44.9459 + - type: nauc_ndcg_at_20_std + value: -2.1965 + - type: nauc_ndcg_at_20_diff1 + value: 53.7261 + - type: nauc_ndcg_at_100_max + value: 45.0603 + - type: nauc_ndcg_at_100_std + value: -1.1026 + - type: nauc_ndcg_at_100_diff1 + value: 54.059900000000006 + - type: nauc_ndcg_at_1000_max + value: 44.9294 + - type: nauc_ndcg_at_1000_std + value: -1.7629 + - type: nauc_ndcg_at_1000_diff1 + value: 54.57189999999999 + - type: nauc_map_at_1_max + value: 34.3031 + - type: nauc_map_at_1_std + value: -8.9637 + - type: nauc_map_at_1_diff1 + value: 57.99100000000001 + - type: nauc_map_at_3_max + value: 40.732 + - type: nauc_map_at_3_std + value: -8.312999999999999 + - type: nauc_map_at_3_diff1 + value: 55.9106 + - type: nauc_map_at_5_max + value: 42.1709 + - type: nauc_map_at_5_std + value: -6.9354 + - type: nauc_map_at_5_diff1 + value: 56.042899999999996 + - type: nauc_map_at_10_max + value: 42.1589 + - type: nauc_map_at_10_std + value: -6.3601 + - type: nauc_map_at_10_diff1 + value: 55.490700000000004 + - type: nauc_map_at_20_max + value: 42.595 + - type: nauc_map_at_20_std + value: -5.5588 + - type: nauc_map_at_20_diff1 + value: 55.4651 + - type: nauc_map_at_100_max + value: 42.6911 + - type: nauc_map_at_100_std + value: -5.2459999999999996 
+ - type: nauc_map_at_100_diff1 + value: 55.45060000000001 + - type: nauc_map_at_1000_max + value: 42.7134 + - type: nauc_map_at_1000_std + value: -5.2317 + - type: nauc_map_at_1000_diff1 + value: 55.4871 + - type: nauc_recall_at_1_max + value: 34.3031 + - type: nauc_recall_at_1_std + value: -8.9637 + - type: nauc_recall_at_1_diff1 + value: 57.99100000000001 + - type: nauc_recall_at_3_max + value: 43.623400000000004 + - type: nauc_recall_at_3_std + value: -6.2843 + - type: nauc_recall_at_3_diff1 + value: 50.775800000000004 + - type: nauc_recall_at_5_max + value: 48.7222 + - type: nauc_recall_at_5_std + value: -0.9506000000000001 + - type: nauc_recall_at_5_diff1 + value: 50.41480000000001 + - type: nauc_recall_at_10_max + value: 47.6178 + - type: nauc_recall_at_10_std + value: 2.2783 + - type: nauc_recall_at_10_diff1 + value: 45.1663 + - type: nauc_recall_at_20_max + value: 51.454 + - type: nauc_recall_at_20_std + value: 11.8339 + - type: nauc_recall_at_20_diff1 + value: 42.8694 + - type: nauc_recall_at_100_max + value: 58.145500000000006 + - type: nauc_recall_at_100_std + value: 35.4717 + - type: nauc_recall_at_100_diff1 + value: 40.8401 + - type: nauc_recall_at_1000_max + value: 79.9122 + - type: nauc_recall_at_1000_std + value: 64.5076 + - type: nauc_recall_at_1000_diff1 + value: 48.7357 + - type: nauc_precision_at_1_max + value: 39.467400000000005 + - type: nauc_precision_at_1_std + value: -6.512 + - type: nauc_precision_at_1_diff1 + value: 57.337700000000005 + - type: nauc_precision_at_3_max + value: 39.763799999999996 + - type: nauc_precision_at_3_std + value: 2.8881 + - type: nauc_precision_at_3_diff1 + value: 30.5735 + - type: nauc_precision_at_5_max + value: 38.062200000000004 + - type: nauc_precision_at_5_std + value: 10.2952 + - type: nauc_precision_at_5_diff1 + value: 21.2531 + - type: nauc_precision_at_10_max + value: 31.330099999999998 + - type: nauc_precision_at_10_std + value: 16.6561 + - type: nauc_precision_at_10_diff1 + value: 8.4745 + - type: 
nauc_precision_at_20_max + value: 28.5499 + - type: nauc_precision_at_20_std + value: 25.593300000000003 + - type: nauc_precision_at_20_diff1 + value: 0.8708 + - type: nauc_precision_at_100_max + value: 20.275299999999998 + - type: nauc_precision_at_100_std + value: 31.6878 + - type: nauc_precision_at_100_diff1 + value: -8.8113 + - type: nauc_precision_at_1000_max + value: 15.4133 + - type: nauc_precision_at_1000_std + value: 29.5211 + - type: nauc_precision_at_1000_diff1 + value: -11.061300000000001 + - type: nauc_mrr_at_1_max + value: 39.467400000000005 + - type: nauc_mrr_at_1_std + value: -6.512 + - type: nauc_mrr_at_1_diff1 + value: 57.337700000000005 + - type: nauc_mrr_at_3_max + value: 42.9279 + - type: nauc_mrr_at_3_std + value: -5.251200000000001 + - type: nauc_mrr_at_3_diff1 + value: 54.8802 + - type: nauc_mrr_at_5_max + value: 43.5261 + - type: nauc_mrr_at_5_std + value: -4.4842 + - type: nauc_mrr_at_5_diff1 + value: 54.874500000000005 + - type: nauc_mrr_at_10_max + value: 43.2392 + - type: nauc_mrr_at_10_std + value: -4.2739 + - type: nauc_mrr_at_10_diff1 + value: 54.5466 + - type: nauc_mrr_at_20_max + value: 43.2263 + - type: nauc_mrr_at_20_std + value: -4.122 + - type: nauc_mrr_at_20_diff1 + value: 54.5397 + - type: nauc_mrr_at_100_max + value: 43.2131 + - type: nauc_mrr_at_100_std + value: -4.041 + - type: nauc_mrr_at_100_diff1 + value: 54.586800000000004 + - type: nauc_mrr_at_1000_max + value: 43.2078 + - type: nauc_mrr_at_1000_std + value: -4.0622 + - type: nauc_mrr_at_1000_diff1 + value: 54.606100000000005 + - type: main_score + value: 63.172 + task: + type: Retrieval + - dataset: + config: default + name: MTEB CQADupstackGisRetrieval (default) + revision: 5003b3064772da1887988e05400cf3806fe491f2 + split: test + type: mteb/cqadupstack-gis + metrics: + - type: ndcg_at_1 + value: 32.429 + - type: ndcg_at_3 + value: 39.639 + - type: ndcg_at_5 + value: 42.051 + - type: ndcg_at_10 + value: 44.759 + - type: ndcg_at_20 + value: 46.588 + - type: 
ndcg_at_100 + value: 49.457 + - type: ndcg_at_1000 + value: 51.248000000000005 + - type: map_at_1 + value: 30.259999999999998 + - type: map_at_3 + value: 36.998 + - type: map_at_5 + value: 38.452 + - type: map_at_10 + value: 39.653 + - type: map_at_20 + value: 40.199 + - type: map_at_100 + value: 40.63 + - type: map_at_1000 + value: 40.701 + - type: recall_at_1 + value: 30.259999999999998 + - type: recall_at_3 + value: 44.531 + - type: recall_at_5 + value: 50.349999999999994 + - type: recall_at_10 + value: 58.294999999999995 + - type: recall_at_20 + value: 65.19200000000001 + - type: recall_at_100 + value: 79.699 + - type: recall_at_1000 + value: 93.181 + - type: precision_at_1 + value: 32.429 + - type: precision_at_3 + value: 16.61 + - type: precision_at_5 + value: 11.39 + - type: precision_at_10 + value: 6.746 + - type: precision_at_20 + value: 3.8019999999999996 + - type: precision_at_100 + value: 0.963 + - type: precision_at_1000 + value: 0.11399999999999999 + - type: mrr_at_1 + value: 32.4294 + - type: mrr_at_3 + value: 39.265499999999996 + - type: mrr_at_5 + value: 40.6158 + - type: mrr_at_10 + value: 41.7454 + - type: mrr_at_20 + value: 42.187999999999995 + - type: mrr_at_100 + value: 42.530699999999996 + - type: mrr_at_1000 + value: 42.584300000000006 + - type: nauc_ndcg_at_1_max + value: 30.2344 + - type: nauc_ndcg_at_1_std + value: -8.76 + - type: nauc_ndcg_at_1_diff1 + value: 43.3339 + - type: nauc_ndcg_at_3_max + value: 31.300299999999996 + - type: nauc_ndcg_at_3_std + value: -5.2691 + - type: nauc_ndcg_at_3_diff1 + value: 39.6872 + - type: nauc_ndcg_at_5_max + value: 31.844099999999997 + - type: nauc_ndcg_at_5_std + value: -4.228400000000001 + - type: nauc_ndcg_at_5_diff1 + value: 38.2047 + - type: nauc_ndcg_at_10_max + value: 31.664900000000003 + - type: nauc_ndcg_at_10_std + value: -3.2960000000000003 + - type: nauc_ndcg_at_10_diff1 + value: 36.6259 + - type: nauc_ndcg_at_20_max + value: 31.630999999999997 + - type: nauc_ndcg_at_20_std + value: 
-2.6685 + - type: nauc_ndcg_at_20_diff1 + value: 36.577 + - type: nauc_ndcg_at_100_max + value: 32.283899999999996 + - type: nauc_ndcg_at_100_std + value: -2.1553 + - type: nauc_ndcg_at_100_diff1 + value: 36.3958 + - type: nauc_ndcg_at_1000_max + value: 32.4852 + - type: nauc_ndcg_at_1000_std + value: -2.3408 + - type: nauc_ndcg_at_1000_diff1 + value: 37.0227 + - type: nauc_map_at_1_max + value: 27.620800000000003 + - type: nauc_map_at_1_std + value: -10.7657 + - type: nauc_map_at_1_diff1 + value: 43.7864 + - type: nauc_map_at_3_max + value: 30.0483 + - type: nauc_map_at_3_std + value: -6.9221 + - type: nauc_map_at_3_diff1 + value: 40.826 + - type: nauc_map_at_5_max + value: 30.560399999999998 + - type: nauc_map_at_5_std + value: -6.1894 + - type: nauc_map_at_5_diff1 + value: 40.0042 + - type: nauc_map_at_10_max + value: 30.665100000000002 + - type: nauc_map_at_10_std + value: -5.8472 + - type: nauc_map_at_10_diff1 + value: 39.3857 + - type: nauc_map_at_20_max + value: 30.761699999999998 + - type: nauc_map_at_20_std + value: -5.591 + - type: nauc_map_at_20_diff1 + value: 39.4111 + - type: nauc_map_at_100_max + value: 30.859399999999997 + - type: nauc_map_at_100_std + value: -5.532 + - type: nauc_map_at_100_diff1 + value: 39.3888 + - type: nauc_map_at_1000_max + value: 30.871199999999998 + - type: nauc_map_at_1000_std + value: -5.5322000000000005 + - type: nauc_map_at_1000_diff1 + value: 39.4166 + - type: nauc_recall_at_1_max + value: 27.620800000000003 + - type: nauc_recall_at_1_std + value: -10.7657 + - type: nauc_recall_at_1_diff1 + value: 43.7864 + - type: nauc_recall_at_3_max + value: 31.187199999999997 + - type: nauc_recall_at_3_std + value: -2.5515 + - type: nauc_recall_at_3_diff1 + value: 36.9576 + - type: nauc_recall_at_5_max + value: 32.6827 + - type: nauc_recall_at_5_std + value: -0.4259 + - type: nauc_recall_at_5_diff1 + value: 33.1674 + - type: nauc_recall_at_10_max + value: 31.729400000000002 + - type: nauc_recall_at_10_std + value: 2.8294 + - type: 
nauc_recall_at_10_diff1 + value: 27.7289 + - type: nauc_recall_at_20_max + value: 30.9251 + - type: nauc_recall_at_20_std + value: 5.9573 + - type: nauc_recall_at_20_diff1 + value: 26.271499999999996 + - type: nauc_recall_at_100_max + value: 35.8557 + - type: nauc_recall_at_100_std + value: 14.478399999999999 + - type: nauc_recall_at_100_diff1 + value: 20.6213 + - type: nauc_recall_at_1000_max + value: 49.7086 + - type: nauc_recall_at_1000_std + value: 36.9282 + - type: nauc_recall_at_1000_diff1 + value: 14.288300000000001 + - type: nauc_precision_at_1_max + value: 30.2344 + - type: nauc_precision_at_1_std + value: -8.76 + - type: nauc_precision_at_1_diff1 + value: 43.3339 + - type: nauc_precision_at_3_max + value: 34.808699999999995 + - type: nauc_precision_at_3_std + value: 0.7861999999999999 + - type: nauc_precision_at_3_diff1 + value: 33.232299999999995 + - type: nauc_precision_at_5_max + value: 35.9325 + - type: nauc_precision_at_5_std + value: 4.1644 + - type: nauc_precision_at_5_diff1 + value: 28.872799999999998 + - type: nauc_precision_at_10_max + value: 34.2471 + - type: nauc_precision_at_10_std + value: 7.2728 + - type: nauc_precision_at_10_diff1 + value: 21.044999999999998 + - type: nauc_precision_at_20_max + value: 31.828200000000002 + - type: nauc_precision_at_20_std + value: 10.2775 + - type: nauc_precision_at_20_diff1 + value: 16.7988 + - type: nauc_precision_at_100_max + value: 26.320100000000004 + - type: nauc_precision_at_100_std + value: 14.0416 + - type: nauc_precision_at_100_diff1 + value: 3.4286999999999996 + - type: nauc_precision_at_1000_max + value: 17.6282 + - type: nauc_precision_at_1000_std + value: 13.1888 + - type: nauc_precision_at_1000_diff1 + value: -6.7075 + - type: nauc_mrr_at_1_max + value: 30.2344 + - type: nauc_mrr_at_1_std + value: -8.76 + - type: nauc_mrr_at_1_diff1 + value: 43.3339 + - type: nauc_mrr_at_3_max + value: 32.2423 + - type: nauc_mrr_at_3_std + value: -4.6264 + - type: nauc_mrr_at_3_diff1 + value: 39.6214 + - 
type: nauc_mrr_at_5_max + value: 32.496199999999995 + - type: nauc_mrr_at_5_std + value: -4.3406 + - type: nauc_mrr_at_5_diff1 + value: 38.921 + - type: nauc_mrr_at_10_max + value: 32.330799999999996 + - type: nauc_mrr_at_10_std + value: -3.943 + - type: nauc_mrr_at_10_diff1 + value: 38.2251 + - type: nauc_mrr_at_20_max + value: 32.1807 + - type: nauc_mrr_at_20_std + value: -3.9316999999999998 + - type: nauc_mrr_at_20_diff1 + value: 38.2161 + - type: nauc_mrr_at_100_max + value: 32.2413 + - type: nauc_mrr_at_100_std + value: -3.8869000000000002 + - type: nauc_mrr_at_100_diff1 + value: 38.217800000000004 + - type: nauc_mrr_at_1000_max + value: 32.2481 + - type: nauc_mrr_at_1000_std + value: -3.8933000000000004 + - type: nauc_mrr_at_1000_diff1 + value: 38.2515 + - type: main_score + value: 44.759 + task: + type: Retrieval + - dataset: + config: default + name: MTEB CQADupstackMathematicaRetrieval (default) + revision: 90fceea13679c63fe563ded68f3b6f06e50061de + split: test + type: mteb/cqadupstack-mathematica + metrics: + - type: ndcg_at_1 + value: 22.761 + - type: ndcg_at_3 + value: 27.578999999999997 + - type: ndcg_at_5 + value: 30.067 + - type: ndcg_at_10 + value: 32.823 + - type: ndcg_at_20 + value: 35.129 + - type: ndcg_at_100 + value: 38.903999999999996 + - type: ndcg_at_1000 + value: 41.181 + - type: map_at_1 + value: 18.360000000000003 + - type: map_at_3 + value: 24.264 + - type: map_at_5 + value: 25.844 + - type: map_at_10 + value: 27.093 + - type: map_at_20 + value: 27.839999999999996 + - type: map_at_100 + value: 28.416999999999998 + - type: map_at_1000 + value: 28.517 + - type: recall_at_1 + value: 18.360000000000003 + - type: recall_at_3 + value: 31.044 + - type: recall_at_5 + value: 37.432 + - type: recall_at_10 + value: 45.525999999999996 + - type: recall_at_20 + value: 53.557 + - type: recall_at_100 + value: 72.14500000000001 + - type: recall_at_1000 + value: 88.041 + - type: precision_at_1 + value: 22.761 + - type: precision_at_3 + value: 
13.350000000000001 + - type: precision_at_5 + value: 9.801 + - type: precision_at_10 + value: 6.157 + - type: precision_at_20 + value: 3.744 + - type: precision_at_100 + value: 1.055 + - type: precision_at_1000 + value: 0.13799999999999998 + - type: mrr_at_1 + value: 22.761200000000002 + - type: mrr_at_3 + value: 29.187400000000004 + - type: mrr_at_5 + value: 30.866500000000002 + - type: mrr_at_10 + value: 32.0236 + - type: mrr_at_20 + value: 32.5924 + - type: mrr_at_100 + value: 32.995000000000005 + - type: mrr_at_1000 + value: 33.042100000000005 + - type: nauc_ndcg_at_1_max + value: 22.3876 + - type: nauc_ndcg_at_1_std + value: -0.26649999999999996 + - type: nauc_ndcg_at_1_diff1 + value: 42.7688 + - type: nauc_ndcg_at_3_max + value: 24.329 + - type: nauc_ndcg_at_3_std + value: 1.3894 + - type: nauc_ndcg_at_3_diff1 + value: 38.5792 + - type: nauc_ndcg_at_5_max + value: 24.331 + - type: nauc_ndcg_at_5_std + value: 3.1460000000000004 + - type: nauc_ndcg_at_5_diff1 + value: 36.1599 + - type: nauc_ndcg_at_10_max + value: 23.9962 + - type: nauc_ndcg_at_10_std + value: 3.6198 + - type: nauc_ndcg_at_10_diff1 + value: 34.615899999999996 + - type: nauc_ndcg_at_20_max + value: 23.189899999999998 + - type: nauc_ndcg_at_20_std + value: 3.3743000000000003 + - type: nauc_ndcg_at_20_diff1 + value: 34.5344 + - type: nauc_ndcg_at_100_max + value: 24.1644 + - type: nauc_ndcg_at_100_std + value: 5.3245000000000005 + - type: nauc_ndcg_at_100_diff1 + value: 34.1404 + - type: nauc_ndcg_at_1000_max + value: 24.4504 + - type: nauc_ndcg_at_1000_std + value: 5.0385 + - type: nauc_ndcg_at_1000_diff1 + value: 34.3277 + - type: nauc_map_at_1_max + value: 20.5435 + - type: nauc_map_at_1_std + value: -0.1746 + - type: nauc_map_at_1_diff1 + value: 43.252 + - type: nauc_map_at_3_max + value: 23.108999999999998 + - type: nauc_map_at_3_std + value: 0.8848 + - type: nauc_map_at_3_diff1 + value: 39.9259 + - type: nauc_map_at_5_max + value: 23.329900000000002 + - type: nauc_map_at_5_std + value: 
1.7795999999999998 + - type: nauc_map_at_5_diff1 + value: 38.448 + - type: nauc_map_at_10_max + value: 23.1789 + - type: nauc_map_at_10_std + value: 2.1036 + - type: nauc_map_at_10_diff1 + value: 37.653 + - type: nauc_map_at_20_max + value: 22.9132 + - type: nauc_map_at_20_std + value: 2.1094 + - type: nauc_map_at_20_diff1 + value: 37.5569 + - type: nauc_map_at_100_max + value: 23.0857 + - type: nauc_map_at_100_std + value: 2.4645 + - type: nauc_map_at_100_diff1 + value: 37.4881 + - type: nauc_map_at_1000_max + value: 23.0988 + - type: nauc_map_at_1000_std + value: 2.4427999999999996 + - type: nauc_map_at_1000_diff1 + value: 37.4707 + - type: nauc_recall_at_1_max + value: 20.5435 + - type: nauc_recall_at_1_std + value: -0.1746 + - type: nauc_recall_at_1_diff1 + value: 43.252 + - type: nauc_recall_at_3_max + value: 24.393500000000003 + - type: nauc_recall_at_3_std + value: 3.3230999999999997 + - type: nauc_recall_at_3_diff1 + value: 34.7983 + - type: nauc_recall_at_5_max + value: 23.4229 + - type: nauc_recall_at_5_std + value: 6.2542 + - type: nauc_recall_at_5_diff1 + value: 28.8147 + - type: nauc_recall_at_10_max + value: 22.6162 + - type: nauc_recall_at_10_std + value: 6.9113 + - type: nauc_recall_at_10_diff1 + value: 24.617900000000002 + - type: nauc_recall_at_20_max + value: 19.8826 + - type: nauc_recall_at_20_std + value: 6.0004 + - type: nauc_recall_at_20_diff1 + value: 24.0887 + - type: nauc_recall_at_100_max + value: 24.428900000000002 + - type: nauc_recall_at_100_std + value: 18.8358 + - type: nauc_recall_at_100_diff1 + value: 18.6841 + - type: nauc_recall_at_1000_max + value: 34.9059 + - type: nauc_recall_at_1000_std + value: 30.6124 + - type: nauc_recall_at_1000_diff1 + value: 11.7067 + - type: nauc_precision_at_1_max + value: 22.3876 + - type: nauc_precision_at_1_std + value: -0.26649999999999996 + - type: nauc_precision_at_1_diff1 + value: 42.7688 + - type: nauc_precision_at_3_max + value: 24.7919 + - type: nauc_precision_at_3_std + value: 1.3971 + - 
type: nauc_precision_at_3_diff1 + value: 32.175599999999996 + - type: nauc_precision_at_5_max + value: 25.4503 + - type: nauc_precision_at_5_std + value: 4.4636000000000005 + - type: nauc_precision_at_5_diff1 + value: 25.453599999999998 + - type: nauc_precision_at_10_max + value: 21.1404 + - type: nauc_precision_at_10_std + value: 4.7988 + - type: nauc_precision_at_10_diff1 + value: 17.3144 + - type: nauc_precision_at_20_max + value: 16.4733 + - type: nauc_precision_at_20_std + value: 3.7228999999999997 + - type: nauc_precision_at_20_diff1 + value: 12.853 + - type: nauc_precision_at_100_max + value: 12.5551 + - type: nauc_precision_at_100_std + value: 6.2132 + - type: nauc_precision_at_100_diff1 + value: 1.2163 + - type: nauc_precision_at_1000_max + value: 2.706 + - type: nauc_precision_at_1000_std + value: -0.7363999999999999 + - type: nauc_precision_at_1000_diff1 + value: -6.0556 + - type: nauc_mrr_at_1_max + value: 22.3876 + - type: nauc_mrr_at_1_std + value: -0.26649999999999996 + - type: nauc_mrr_at_1_diff1 + value: 42.7688 + - type: nauc_mrr_at_3_max + value: 24.9398 + - type: nauc_mrr_at_3_std + value: 1.5026 + - type: nauc_mrr_at_3_diff1 + value: 39.2078 + - type: nauc_mrr_at_5_max + value: 24.9525 + - type: nauc_mrr_at_5_std + value: 2.2446 + - type: nauc_mrr_at_5_diff1 + value: 37.9502 + - type: nauc_mrr_at_10_max + value: 24.8361 + - type: nauc_mrr_at_10_std + value: 2.1445 + - type: nauc_mrr_at_10_diff1 + value: 37.4108 + - type: nauc_mrr_at_20_max + value: 24.529300000000003 + - type: nauc_mrr_at_20_std + value: 2.0292 + - type: nauc_mrr_at_20_diff1 + value: 37.3959 + - type: nauc_mrr_at_100_max + value: 24.627299999999998 + - type: nauc_mrr_at_100_std + value: 2.2496 + - type: nauc_mrr_at_100_diff1 + value: 37.4236 + - type: nauc_mrr_at_1000_max + value: 24.6481 + - type: nauc_mrr_at_1000_std + value: 2.2540999999999998 + - type: nauc_mrr_at_1000_diff1 + value: 37.4501 + - type: main_score + value: 32.823 + task: + type: Retrieval + - dataset: + 
config: default + name: MTEB CQADupstackPhysicsRetrieval (default) + revision: 79531abbd1fb92d06c6d6315a0cbbbf5bb247ea4 + split: test + type: mteb/cqadupstack-physics + metrics: + - type: ndcg_at_1 + value: 40.135 + - type: ndcg_at_3 + value: 45.062999999999995 + - type: ndcg_at_5 + value: 47.674 + - type: ndcg_at_10 + value: 50.312 + - type: ndcg_at_20 + value: 52.349000000000004 + - type: ndcg_at_100 + value: 55.428 + - type: ndcg_at_1000 + value: 57.202 + - type: map_at_1 + value: 32.757 + - type: map_at_3 + value: 40.722 + - type: map_at_5 + value: 42.656 + - type: map_at_10 + value: 44.162 + - type: map_at_20 + value: 44.889 + - type: map_at_100 + value: 45.454 + - type: map_at_1000 + value: 45.562999999999995 + - type: recall_at_1 + value: 32.757 + - type: recall_at_3 + value: 48.120000000000005 + - type: recall_at_5 + value: 54.666000000000004 + - type: recall_at_10 + value: 62.632 + - type: recall_at_20 + value: 69.592 + - type: recall_at_100 + value: 83.863 + - type: recall_at_1000 + value: 95.065 + - type: precision_at_1 + value: 40.135 + - type: precision_at_3 + value: 21.367 + - type: precision_at_5 + value: 15.265 + - type: precision_at_10 + value: 9.057 + - type: precision_at_20 + value: 5.25 + - type: precision_at_100 + value: 1.347 + - type: precision_at_1000 + value: 0.169 + - type: mrr_at_1 + value: 40.1347 + - type: mrr_at_3 + value: 47.3532 + - type: mrr_at_5 + value: 48.8547 + - type: mrr_at_10 + value: 49.9016 + - type: mrr_at_20 + value: 50.31250000000001 + - type: mrr_at_100 + value: 50.6278 + - type: mrr_at_1000 + value: 50.6652 + - type: nauc_ndcg_at_1_max + value: 38.7881 + - type: nauc_ndcg_at_1_std + value: -8.296000000000001 + - type: nauc_ndcg_at_1_diff1 + value: 52.21130000000001 + - type: nauc_ndcg_at_3_max + value: 38.7708 + - type: nauc_ndcg_at_3_std + value: -6.576700000000001 + - type: nauc_ndcg_at_3_diff1 + value: 48.9321 + - type: nauc_ndcg_at_5_max + value: 38.438 + - type: nauc_ndcg_at_5_std + value: -6.2548 + - type: 
nauc_ndcg_at_5_diff1 + value: 48.0762 + - type: nauc_ndcg_at_10_max + value: 38.365899999999996 + - type: nauc_ndcg_at_10_std + value: -5.7385 + - type: nauc_ndcg_at_10_diff1 + value: 48.158899999999996 + - type: nauc_ndcg_at_20_max + value: 39.0394 + - type: nauc_ndcg_at_20_std + value: -5.0741000000000005 + - type: nauc_ndcg_at_20_diff1 + value: 48.540499999999994 + - type: nauc_ndcg_at_100_max + value: 39.7277 + - type: nauc_ndcg_at_100_std + value: -2.7447 + - type: nauc_ndcg_at_100_diff1 + value: 47.9735 + - type: nauc_ndcg_at_1000_max + value: 40.0211 + - type: nauc_ndcg_at_1000_std + value: -2.7227 + - type: nauc_ndcg_at_1000_diff1 + value: 48.1857 + - type: nauc_map_at_1_max + value: 33.7229 + - type: nauc_map_at_1_std + value: -12.5585 + - type: nauc_map_at_1_diff1 + value: 54.0852 + - type: nauc_map_at_3_max + value: 36.403 + - type: nauc_map_at_3_std + value: -9.1775 + - type: nauc_map_at_3_diff1 + value: 49.7749 + - type: nauc_map_at_5_max + value: 36.804500000000004 + - type: nauc_map_at_5_std + value: -8.4613 + - type: nauc_map_at_5_diff1 + value: 49.1705 + - type: nauc_map_at_10_max + value: 37.3301 + - type: nauc_map_at_10_std + value: -7.706200000000001 + - type: nauc_map_at_10_diff1 + value: 49.3899 + - type: nauc_map_at_20_max + value: 37.541999999999994 + - type: nauc_map_at_20_std + value: -7.4139 + - type: nauc_map_at_20_diff1 + value: 49.4555 + - type: nauc_map_at_100_max + value: 37.7874 + - type: nauc_map_at_100_std + value: -6.8967 + - type: nauc_map_at_100_diff1 + value: 49.336999999999996 + - type: nauc_map_at_1000_max + value: 37.8174 + - type: nauc_map_at_1000_std + value: -6.8435 + - type: nauc_map_at_1000_diff1 + value: 49.3269 + - type: nauc_recall_at_1_max + value: 33.7229 + - type: nauc_recall_at_1_std + value: -12.5585 + - type: nauc_recall_at_1_diff1 + value: 54.0852 + - type: nauc_recall_at_3_max + value: 34.7265 + - type: nauc_recall_at_3_std + value: -8.2544 + - type: nauc_recall_at_3_diff1 + value: 45.2066 + - type: 
nauc_recall_at_5_max + value: 34.319 + - type: nauc_recall_at_5_std + value: -6.7825 + - type: nauc_recall_at_5_diff1 + value: 41.783 + - type: nauc_recall_at_10_max + value: 34.5308 + - type: nauc_recall_at_10_std + value: -3.8527 + - type: nauc_recall_at_10_diff1 + value: 40.9153 + - type: nauc_recall_at_20_max + value: 36.6563 + - type: nauc_recall_at_20_std + value: -0.6942 + - type: nauc_recall_at_20_diff1 + value: 41.7078 + - type: nauc_recall_at_100_max + value: 38.7406 + - type: nauc_recall_at_100_std + value: 18.8691 + - type: nauc_recall_at_100_diff1 + value: 34.8788 + - type: nauc_recall_at_1000_max + value: 53.96490000000001 + - type: nauc_recall_at_1000_std + value: 46.1526 + - type: nauc_recall_at_1000_diff1 + value: 34.4075 + - type: nauc_precision_at_1_max + value: 38.7881 + - type: nauc_precision_at_1_std + value: -8.296000000000001 + - type: nauc_precision_at_1_diff1 + value: 52.21130000000001 + - type: nauc_precision_at_3_max + value: 38.4296 + - type: nauc_precision_at_3_std + value: 5.1817 + - type: nauc_precision_at_3_diff1 + value: 32.3129 + - type: nauc_precision_at_5_max + value: 33.9238 + - type: nauc_precision_at_5_std + value: 10.5533 + - type: nauc_precision_at_5_diff1 + value: 22.5911 + - type: nauc_precision_at_10_max + value: 30.967 + - type: nauc_precision_at_10_std + value: 16.371 + - type: nauc_precision_at_10_diff1 + value: 15.714 + - type: nauc_precision_at_20_max + value: 27.0551 + - type: nauc_precision_at_20_std + value: 18.2058 + - type: nauc_precision_at_20_diff1 + value: 10.084 + - type: nauc_precision_at_100_max + value: 18.493000000000002 + - type: nauc_precision_at_100_std + value: 25.315199999999997 + - type: nauc_precision_at_100_diff1 + value: -5.4256 + - type: nauc_precision_at_1000_max + value: 6.7 + - type: nauc_precision_at_1000_std + value: 22.2852 + - type: nauc_precision_at_1000_diff1 + value: -14.102 + - type: nauc_mrr_at_1_max + value: 38.7881 + - type: nauc_mrr_at_1_std + value: -8.296000000000001 + - type: 
nauc_mrr_at_1_diff1 + value: 52.21130000000001 + - type: nauc_mrr_at_3_max + value: 40.9462 + - type: nauc_mrr_at_3_std + value: -5.224 + - type: nauc_mrr_at_3_diff1 + value: 49.9567 + - type: nauc_mrr_at_5_max + value: 40.6606 + - type: nauc_mrr_at_5_std + value: -5.1892000000000005 + - type: nauc_mrr_at_5_diff1 + value: 49.274499999999996 + - type: nauc_mrr_at_10_max + value: 40.7644 + - type: nauc_mrr_at_10_std + value: -4.7934 + - type: nauc_mrr_at_10_diff1 + value: 49.2337 + - type: nauc_mrr_at_20_max + value: 40.8569 + - type: nauc_mrr_at_20_std + value: -4.7076 + - type: nauc_mrr_at_20_diff1 + value: 49.358999999999995 + - type: nauc_mrr_at_100_max + value: 40.8362 + - type: nauc_mrr_at_100_std + value: -4.5678 + - type: nauc_mrr_at_100_diff1 + value: 49.32 + - type: nauc_mrr_at_1000_max + value: 40.827400000000004 + - type: nauc_mrr_at_1000_std + value: -4.5844000000000005 + - type: nauc_mrr_at_1000_diff1 + value: 49.3213 + - type: main_score + value: 50.312 + task: + type: Retrieval + - dataset: + config: default + name: MTEB CQADupstackProgrammersRetrieval (default) + revision: 6184bc1440d2dbc7612be22b50686b8826d22b32 + split: test + type: mteb/cqadupstack-programmers + metrics: + - type: ndcg_at_1 + value: 38.013999999999996 + - type: ndcg_at_3 + value: 42.824 + - type: ndcg_at_5 + value: 45.074999999999996 + - type: ndcg_at_10 + value: 47.769 + - type: ndcg_at_20 + value: 49.964 + - type: ndcg_at_100 + value: 53.271 + - type: ndcg_at_1000 + value: 55.217000000000006 + - type: map_at_1 + value: 31.751 + - type: map_at_3 + value: 38.95 + - type: map_at_5 + value: 40.681 + - type: map_at_10 + value: 42.097 + - type: map_at_20 + value: 42.892 + - type: map_at_100 + value: 43.472 + - type: map_at_1000 + value: 43.578 + - type: recall_at_1 + value: 31.751 + - type: recall_at_3 + value: 45.409 + - type: recall_at_5 + value: 51.373000000000005 + - type: recall_at_10 + value: 59.168 + - type: recall_at_20 + value: 66.669 + - type: recall_at_100 + value: 
82.26400000000001 + - type: recall_at_1000 + value: 95.017 + - type: precision_at_1 + value: 38.013999999999996 + - type: precision_at_3 + value: 19.977 + - type: precision_at_5 + value: 14.11 + - type: precision_at_10 + value: 8.493 + - type: precision_at_20 + value: 5.0 + - type: precision_at_100 + value: 1.312 + - type: precision_at_1000 + value: 0.165 + - type: mrr_at_1 + value: 38.0137 + - type: mrr_at_3 + value: 44.9772 + - type: mrr_at_5 + value: 46.387 + - type: mrr_at_10 + value: 47.384100000000004 + - type: mrr_at_20 + value: 47.8746 + - type: mrr_at_100 + value: 48.2235 + - type: mrr_at_1000 + value: 48.2699 + - type: nauc_ndcg_at_1_max + value: 35.9967 + - type: nauc_ndcg_at_1_std + value: 4.926500000000001 + - type: nauc_ndcg_at_1_diff1 + value: 43.5414 + - type: nauc_ndcg_at_3_max + value: 35.4574 + - type: nauc_ndcg_at_3_std + value: 2.6951 + - type: nauc_ndcg_at_3_diff1 + value: 38.5888 + - type: nauc_ndcg_at_5_max + value: 35.7783 + - type: nauc_ndcg_at_5_std + value: 3.5970000000000004 + - type: nauc_ndcg_at_5_diff1 + value: 38.107 + - type: nauc_ndcg_at_10_max + value: 35.9047 + - type: nauc_ndcg_at_10_std + value: 5.3849 + - type: nauc_ndcg_at_10_diff1 + value: 37.6917 + - type: nauc_ndcg_at_20_max + value: 37.4203 + - type: nauc_ndcg_at_20_std + value: 7.5072 + - type: nauc_ndcg_at_20_diff1 + value: 37.9429 + - type: nauc_ndcg_at_100_max + value: 37.913000000000004 + - type: nauc_ndcg_at_100_std + value: 8.8726 + - type: nauc_ndcg_at_100_diff1 + value: 37.8018 + - type: nauc_ndcg_at_1000_max + value: 37.7521 + - type: nauc_ndcg_at_1000_std + value: 8.0898 + - type: nauc_ndcg_at_1000_diff1 + value: 38.188 + - type: nauc_map_at_1_max + value: 30.6039 + - type: nauc_map_at_1_std + value: -1.1973 + - type: nauc_map_at_1_diff1 + value: 44.4956 + - type: nauc_map_at_3_max + value: 33.79 + - type: nauc_map_at_3_std + value: 0.7224999999999999 + - type: nauc_map_at_3_diff1 + value: 40.5918 + - type: nauc_map_at_5_max + value: 34.799 + - type: 
nauc_map_at_5_std + value: 1.9663 + - type: nauc_map_at_5_diff1 + value: 40.119 + - type: nauc_map_at_10_max + value: 35.0036 + - type: nauc_map_at_10_std + value: 2.9479 + - type: nauc_map_at_10_diff1 + value: 39.725899999999996 + - type: nauc_map_at_20_max + value: 35.6907 + - type: nauc_map_at_20_std + value: 3.7684 + - type: nauc_map_at_20_diff1 + value: 39.6845 + - type: nauc_map_at_100_max + value: 35.8249 + - type: nauc_map_at_100_std + value: 4.123 + - type: nauc_map_at_100_diff1 + value: 39.6397 + - type: nauc_map_at_1000_max + value: 35.8146 + - type: nauc_map_at_1000_std + value: 4.100899999999999 + - type: nauc_map_at_1000_diff1 + value: 39.6511 + - type: nauc_recall_at_1_max + value: 30.6039 + - type: nauc_recall_at_1_std + value: -1.1973 + - type: nauc_recall_at_1_diff1 + value: 44.4956 + - type: nauc_recall_at_3_max + value: 33.9619 + - type: nauc_recall_at_3_std + value: 1.3599 + - type: nauc_recall_at_3_diff1 + value: 36.673899999999996 + - type: nauc_recall_at_5_max + value: 34.798899999999996 + - type: nauc_recall_at_5_std + value: 3.9083 + - type: nauc_recall_at_5_diff1 + value: 34.2275 + - type: nauc_recall_at_10_max + value: 34.3508 + - type: nauc_recall_at_10_std + value: 8.6454 + - type: nauc_recall_at_10_diff1 + value: 31.9422 + - type: nauc_recall_at_20_max + value: 39.1475 + - type: nauc_recall_at_20_std + value: 17.0303 + - type: nauc_recall_at_20_diff1 + value: 32.138099999999994 + - type: nauc_recall_at_100_max + value: 43.452 + - type: nauc_recall_at_100_std + value: 31.8449 + - type: nauc_recall_at_100_diff1 + value: 27.38 + - type: nauc_recall_at_1000_max + value: 56.720000000000006 + - type: nauc_recall_at_1000_std + value: 51.5088 + - type: nauc_recall_at_1000_diff1 + value: 28.131099999999996 + - type: nauc_precision_at_1_max + value: 35.9967 + - type: nauc_precision_at_1_std + value: 4.926500000000001 + - type: nauc_precision_at_1_diff1 + value: 43.5414 + - type: nauc_precision_at_3_max + value: 36.204 + - type: 
nauc_precision_at_3_std + value: 9.6793 + - type: nauc_precision_at_3_diff1 + value: 22.8807 + - type: nauc_precision_at_5_max + value: 34.226 + - type: nauc_precision_at_5_std + value: 14.0818 + - type: nauc_precision_at_5_diff1 + value: 16.223000000000003 + - type: nauc_precision_at_10_max + value: 28.3789 + - type: nauc_precision_at_10_std + value: 18.8125 + - type: nauc_precision_at_10_diff1 + value: 7.382700000000001 + - type: nauc_precision_at_20_max + value: 26.151600000000002 + - type: nauc_precision_at_20_std + value: 22.352 + - type: nauc_precision_at_20_diff1 + value: 1.0934 + - type: nauc_precision_at_100_max + value: 13.886399999999998 + - type: nauc_precision_at_100_std + value: 21.5356 + - type: nauc_precision_at_100_diff1 + value: -10.3265 + - type: nauc_precision_at_1000_max + value: -1.5730000000000002 + - type: nauc_precision_at_1000_std + value: 9.9943 + - type: nauc_precision_at_1000_diff1 + value: -18.5193 + - type: nauc_mrr_at_1_max + value: 35.9967 + - type: nauc_mrr_at_1_std + value: 4.926500000000001 + - type: nauc_mrr_at_1_diff1 + value: 43.5414 + - type: nauc_mrr_at_3_max + value: 37.1377 + - type: nauc_mrr_at_3_std + value: 5.6196 + - type: nauc_mrr_at_3_diff1 + value: 38.9643 + - type: nauc_mrr_at_5_max + value: 36.945499999999996 + - type: nauc_mrr_at_5_std + value: 5.9594000000000005 + - type: nauc_mrr_at_5_diff1 + value: 38.431 + - type: nauc_mrr_at_10_max + value: 37.094300000000004 + - type: nauc_mrr_at_10_std + value: 6.6665 + - type: nauc_mrr_at_10_diff1 + value: 38.4148 + - type: nauc_mrr_at_20_max + value: 37.283100000000005 + - type: nauc_mrr_at_20_std + value: 7.0301 + - type: nauc_mrr_at_20_diff1 + value: 38.6425 + - type: nauc_mrr_at_100_max + value: 37.312200000000004 + - type: nauc_mrr_at_100_std + value: 7.0826 + - type: nauc_mrr_at_100_diff1 + value: 38.689800000000005 + - type: nauc_mrr_at_1000_max + value: 37.319 + - type: nauc_mrr_at_1000_std + value: 7.0653999999999995 + - type: nauc_mrr_at_1000_diff1 + value: 
38.7106 + - type: main_score + value: 47.769 + task: + type: Retrieval + - dataset: + config: default + name: MTEB CQADupstackRetrieval (default) + revision: CQADupstackRetrieval_is_a_combined_dataset + split: test + type: CQADupstackRetrieval_is_a_combined_dataset + metrics: + - type: main_score + value: 46.10300000000001 + - type: ndcg_at_10 + value: 46.10300000000001 + task: + type: Retrieval + - dataset: + config: default + name: MTEB CQADupstackStatsRetrieval (default) + revision: 65ac3a16b8e91f9cee4c9828cc7c335575432a2a + split: test + type: mteb/cqadupstack-stats + metrics: + - type: ndcg_at_1 + value: 32.362 + - type: ndcg_at_3 + value: 36.026 + - type: ndcg_at_5 + value: 38.122 + - type: ndcg_at_10 + value: 40.174 + - type: ndcg_at_20 + value: 41.836 + - type: ndcg_at_100 + value: 44.444 + - type: ndcg_at_1000 + value: 46.929 + - type: map_at_1 + value: 28.871999999999996 + - type: map_at_3 + value: 33.613 + - type: map_at_5 + value: 35.007 + - type: map_at_10 + value: 35.976 + - type: map_at_20 + value: 36.496 + - type: map_at_100 + value: 36.895 + - type: map_at_1000 + value: 36.994 + - type: recall_at_1 + value: 28.871999999999996 + - type: recall_at_3 + value: 38.705 + - type: recall_at_5 + value: 43.821 + - type: recall_at_10 + value: 49.921 + - type: recall_at_20 + value: 56.163 + - type: recall_at_100 + value: 69.084 + - type: recall_at_1000 + value: 87.35000000000001 + - type: precision_at_1 + value: 32.362 + - type: precision_at_3 + value: 15.184000000000001 + - type: precision_at_5 + value: 10.583 + - type: precision_at_10 + value: 6.166 + - type: precision_at_20 + value: 3.512 + - type: precision_at_100 + value: 0.897 + - type: precision_at_1000 + value: 0.11900000000000001 + - type: mrr_at_1 + value: 32.362 + - type: mrr_at_3 + value: 36.937599999999996 + - type: mrr_at_5 + value: 38.1416 + - type: mrr_at_10 + value: 39.012299999999996 + - type: mrr_at_20 + value: 39.4119 + - type: mrr_at_100 + value: 39.745200000000004 + - type: mrr_at_1000 + 
value: 39.8191 + - type: nauc_ndcg_at_1_max + value: 39.396300000000004 + - type: nauc_ndcg_at_1_std + value: 0.8482 + - type: nauc_ndcg_at_1_diff1 + value: 52.376999999999995 + - type: nauc_ndcg_at_3_max + value: 39.0785 + - type: nauc_ndcg_at_3_std + value: 3.2739 + - type: nauc_ndcg_at_3_diff1 + value: 48.3207 + - type: nauc_ndcg_at_5_max + value: 38.4648 + - type: nauc_ndcg_at_5_std + value: 3.3379 + - type: nauc_ndcg_at_5_diff1 + value: 47.468500000000006 + - type: nauc_ndcg_at_10_max + value: 39.0329 + - type: nauc_ndcg_at_10_std + value: 4.0895 + - type: nauc_ndcg_at_10_diff1 + value: 46.1268 + - type: nauc_ndcg_at_20_max + value: 38.359 + - type: nauc_ndcg_at_20_std + value: 4.2744 + - type: nauc_ndcg_at_20_diff1 + value: 45.1661 + - type: nauc_ndcg_at_100_max + value: 39.461 + - type: nauc_ndcg_at_100_std + value: 7.2038 + - type: nauc_ndcg_at_100_diff1 + value: 44.809 + - type: nauc_ndcg_at_1000_max + value: 39.875699999999995 + - type: nauc_ndcg_at_1000_std + value: 6.9621 + - type: nauc_ndcg_at_1000_diff1 + value: 45.473200000000006 + - type: nauc_map_at_1_max + value: 35.936800000000005 + - type: nauc_map_at_1_std + value: -3.2637 + - type: nauc_map_at_1_diff1 + value: 52.3431 + - type: nauc_map_at_3_max + value: 37.8006 + - type: nauc_map_at_3_std + value: 0.7727999999999999 + - type: nauc_map_at_3_diff1 + value: 49.1872 + - type: nauc_map_at_5_max + value: 37.932300000000005 + - type: nauc_map_at_5_std + value: 1.4745 + - type: nauc_map_at_5_diff1 + value: 48.8466 + - type: nauc_map_at_10_max + value: 38.4041 + - type: nauc_map_at_10_std + value: 2.0481 + - type: nauc_map_at_10_diff1 + value: 48.2292 + - type: nauc_map_at_20_max + value: 38.1992 + - type: nauc_map_at_20_std + value: 2.1198 + - type: nauc_map_at_20_diff1 + value: 47.9169 + - type: nauc_map_at_100_max + value: 38.3504 + - type: nauc_map_at_100_std + value: 2.5100000000000002 + - type: nauc_map_at_100_diff1 + value: 47.8259 + - type: nauc_map_at_1000_max + value: 38.3865 + - type: 
nauc_map_at_1000_std + value: 2.5181999999999998 + - type: nauc_map_at_1000_diff1 + value: 47.853699999999996 + - type: nauc_recall_at_1_max + value: 35.936800000000005 + - type: nauc_recall_at_1_std + value: -3.2637 + - type: nauc_recall_at_1_diff1 + value: 52.3431 + - type: nauc_recall_at_3_max + value: 37.227700000000006 + - type: nauc_recall_at_3_std + value: 3.8813 + - type: nauc_recall_at_3_diff1 + value: 44.8185 + - type: nauc_recall_at_5_max + value: 35.963 + - type: nauc_recall_at_5_std + value: 4.9497 + - type: nauc_recall_at_5_diff1 + value: 42.6322 + - type: nauc_recall_at_10_max + value: 37.358000000000004 + - type: nauc_recall_at_10_std + value: 6.6888000000000005 + - type: nauc_recall_at_10_diff1 + value: 38.7639 + - type: nauc_recall_at_20_max + value: 34.2341 + - type: nauc_recall_at_20_std + value: 7.0213 + - type: nauc_recall_at_20_diff1 + value: 34.8021 + - type: nauc_recall_at_100_max + value: 39.406600000000005 + - type: nauc_recall_at_100_std + value: 25.7393 + - type: nauc_recall_at_100_diff1 + value: 29.9173 + - type: nauc_recall_at_1000_max + value: 45.287 + - type: nauc_recall_at_1000_std + value: 38.572 + - type: nauc_recall_at_1000_diff1 + value: 26.744 + - type: nauc_precision_at_1_max + value: 39.396300000000004 + - type: nauc_precision_at_1_std + value: 0.8482 + - type: nauc_precision_at_1_diff1 + value: 52.376999999999995 + - type: nauc_precision_at_3_max + value: 42.1919 + - type: nauc_precision_at_3_std + value: 13.9189 + - type: nauc_precision_at_3_diff1 + value: 40.2337 + - type: nauc_precision_at_5_max + value: 39.8644 + - type: nauc_precision_at_5_std + value: 15.656900000000002 + - type: nauc_precision_at_5_diff1 + value: 35.1421 + - type: nauc_precision_at_10_max + value: 40.7678 + - type: nauc_precision_at_10_std + value: 19.5881 + - type: nauc_precision_at_10_diff1 + value: 28.822300000000002 + - type: nauc_precision_at_20_max + value: 35.4842 + - type: nauc_precision_at_20_std + value: 20.6978 + - type: 
nauc_precision_at_20_diff1 + value: 21.4608 + - type: nauc_precision_at_100_max + value: 33.211400000000005 + - type: nauc_precision_at_100_std + value: 31.5029 + - type: nauc_precision_at_100_diff1 + value: 13.0526 + - type: nauc_precision_at_1000_max + value: 21.6976 + - type: nauc_precision_at_1000_std + value: 26.4203 + - type: nauc_precision_at_1000_diff1 + value: 2.6056 + - type: nauc_mrr_at_1_max + value: 39.396300000000004 + - type: nauc_mrr_at_1_std + value: 0.8482 + - type: nauc_mrr_at_1_diff1 + value: 52.376999999999995 + - type: nauc_mrr_at_3_max + value: 40.191 + - type: nauc_mrr_at_3_std + value: 3.9919999999999995 + - type: nauc_mrr_at_3_diff1 + value: 49.2714 + - type: nauc_mrr_at_5_max + value: 39.9654 + - type: nauc_mrr_at_5_std + value: 4.0258 + - type: nauc_mrr_at_5_diff1 + value: 48.6599 + - type: nauc_mrr_at_10_max + value: 40.1413 + - type: nauc_mrr_at_10_std + value: 4.389 + - type: nauc_mrr_at_10_diff1 + value: 48.0272 + - type: nauc_mrr_at_20_max + value: 39.9265 + - type: nauc_mrr_at_20_std + value: 4.3462 + - type: nauc_mrr_at_20_diff1 + value: 47.8592 + - type: nauc_mrr_at_100_max + value: 40.0623 + - type: nauc_mrr_at_100_std + value: 4.698 + - type: nauc_mrr_at_100_diff1 + value: 47.8456 + - type: nauc_mrr_at_1000_max + value: 40.0698 + - type: nauc_mrr_at_1000_std + value: 4.6803 + - type: nauc_mrr_at_1000_diff1 + value: 47.8659 + - type: main_score + value: 40.174 + task: + type: Retrieval + - dataset: + config: default + name: MTEB CQADupstackTexRetrieval (default) + revision: 46989137a86843e03a6195de44b09deda022eec7 + split: test + type: mteb/cqadupstack-tex + metrics: + - type: ndcg_at_1 + value: 25.155 + - type: ndcg_at_3 + value: 29.339 + - type: ndcg_at_5 + value: 31.452999999999996 + - type: ndcg_at_10 + value: 33.937 + - type: ndcg_at_20 + value: 36.018 + - type: ndcg_at_100 + value: 39.531 + - type: ndcg_at_1000 + value: 42.22 + - type: map_at_1 + value: 20.874000000000002 + - type: map_at_3 + value: 26.345000000000002 + - 
type: map_at_5 + value: 27.773999999999997 + - type: map_at_10 + value: 28.965999999999998 + - type: map_at_20 + value: 29.625 + - type: map_at_100 + value: 30.188 + - type: map_at_1000 + value: 30.314000000000004 + - type: recall_at_1 + value: 20.874000000000002 + - type: recall_at_3 + value: 31.984 + - type: recall_at_5 + value: 37.467 + - type: recall_at_10 + value: 44.774 + - type: recall_at_20 + value: 52.323 + - type: recall_at_100 + value: 69.549 + - type: recall_at_1000 + value: 88.419 + - type: precision_at_1 + value: 25.155 + - type: precision_at_3 + value: 13.719000000000001 + - type: precision_at_5 + value: 9.841999999999999 + - type: precision_at_10 + value: 6.069999999999999 + - type: precision_at_20 + value: 3.6799999999999997 + - type: precision_at_100 + value: 1.045 + - type: precision_at_1000 + value: 0.146 + - type: mrr_at_1 + value: 25.1549 + - type: mrr_at_3 + value: 30.7123 + - type: mrr_at_5 + value: 32.0148 + - type: mrr_at_10 + value: 33.035199999999996 + - type: mrr_at_20 + value: 33.5778 + - type: mrr_at_100 + value: 34.0001 + - type: mrr_at_1000 + value: 34.070499999999996 + - type: nauc_ndcg_at_1_max + value: 34.6903 + - type: nauc_ndcg_at_1_std + value: -0.48469999999999996 + - type: nauc_ndcg_at_1_diff1 + value: 41.827799999999996 + - type: nauc_ndcg_at_3_max + value: 34.7107 + - type: nauc_ndcg_at_3_std + value: 1.2525 + - type: nauc_ndcg_at_3_diff1 + value: 36.09 + - type: nauc_ndcg_at_5_max + value: 34.363899999999994 + - type: nauc_ndcg_at_5_std + value: 1.187 + - type: nauc_ndcg_at_5_diff1 + value: 35.5019 + - type: nauc_ndcg_at_10_max + value: 34.1261 + - type: nauc_ndcg_at_10_std + value: 2.0704000000000002 + - type: nauc_ndcg_at_10_diff1 + value: 35.0098 + - type: nauc_ndcg_at_20_max + value: 34.5028 + - type: nauc_ndcg_at_20_std + value: 2.9973 + - type: nauc_ndcg_at_20_diff1 + value: 34.6486 + - type: nauc_ndcg_at_100_max + value: 34.8192 + - type: nauc_ndcg_at_100_std + value: 4.4281 + - type: nauc_ndcg_at_100_diff1 + 
value: 34.252500000000005 + - type: nauc_ndcg_at_1000_max + value: 34.8293 + - type: nauc_ndcg_at_1000_std + value: 4.2747 + - type: nauc_ndcg_at_1000_diff1 + value: 34.5083 + - type: nauc_map_at_1_max + value: 31.448700000000002 + - type: nauc_map_at_1_std + value: -1.5652 + - type: nauc_map_at_1_diff1 + value: 42.3532 + - type: nauc_map_at_3_max + value: 33.458 + - type: nauc_map_at_3_std + value: 0.372 + - type: nauc_map_at_3_diff1 + value: 37.6257 + - type: nauc_map_at_5_max + value: 33.3902 + - type: nauc_map_at_5_std + value: 0.2957 + - type: nauc_map_at_5_diff1 + value: 37.0708 + - type: nauc_map_at_10_max + value: 33.4473 + - type: nauc_map_at_10_std + value: 0.7451 + - type: nauc_map_at_10_diff1 + value: 36.7872 + - type: nauc_map_at_20_max + value: 33.6705 + - type: nauc_map_at_20_std + value: 1.0755000000000001 + - type: nauc_map_at_20_diff1 + value: 36.6791 + - type: nauc_map_at_100_max + value: 33.772200000000005 + - type: nauc_map_at_100_std + value: 1.308 + - type: nauc_map_at_100_diff1 + value: 36.5896 + - type: nauc_map_at_1000_max + value: 33.7881 + - type: nauc_map_at_1000_std + value: 1.3087 + - type: nauc_map_at_1000_diff1 + value: 36.5978 + - type: nauc_recall_at_1_max + value: 31.448700000000002 + - type: nauc_recall_at_1_std + value: -1.5652 + - type: nauc_recall_at_1_diff1 + value: 42.3532 + - type: nauc_recall_at_3_max + value: 33.7171 + - type: nauc_recall_at_3_std + value: 2.4527 + - type: nauc_recall_at_3_diff1 + value: 32.6832 + - type: nauc_recall_at_5_max + value: 32.7828 + - type: nauc_recall_at_5_std + value: 2.0332 + - type: nauc_recall_at_5_diff1 + value: 30.8446 + - type: nauc_recall_at_10_max + value: 31.6463 + - type: nauc_recall_at_10_std + value: 4.3727 + - type: nauc_recall_at_10_diff1 + value: 29.1731 + - type: nauc_recall_at_20_max + value: 31.968999999999998 + - type: nauc_recall_at_20_std + value: 7.5392 + - type: nauc_recall_at_20_diff1 + value: 26.961299999999998 + - type: nauc_recall_at_100_max + value: 32.9142 + - 
type: nauc_recall_at_100_std + value: 17.2332 + - type: nauc_recall_at_100_diff1 + value: 22.0707 + - type: nauc_recall_at_1000_max + value: 32.1463 + - type: nauc_recall_at_1000_std + value: 29.664600000000004 + - type: nauc_recall_at_1000_diff1 + value: 13.9131 + - type: nauc_precision_at_1_max + value: 34.6903 + - type: nauc_precision_at_1_std + value: -0.48469999999999996 + - type: nauc_precision_at_1_diff1 + value: 41.827799999999996 + - type: nauc_precision_at_3_max + value: 36.8823 + - type: nauc_precision_at_3_std + value: 3.7052 + - type: nauc_precision_at_3_diff1 + value: 29.505599999999998 + - type: nauc_precision_at_5_max + value: 35.106 + - type: nauc_precision_at_5_std + value: 3.9923 + - type: nauc_precision_at_5_diff1 + value: 25.684099999999997 + - type: nauc_precision_at_10_max + value: 32.1139 + - type: nauc_precision_at_10_std + value: 7.097100000000001 + - type: nauc_precision_at_10_diff1 + value: 20.521 + - type: nauc_precision_at_20_max + value: 30.3506 + - type: nauc_precision_at_20_std + value: 9.7899 + - type: nauc_precision_at_20_diff1 + value: 16.106 + - type: nauc_precision_at_100_max + value: 23.7062 + - type: nauc_precision_at_100_std + value: 12.7852 + - type: nauc_precision_at_100_diff1 + value: 5.9668 + - type: nauc_precision_at_1000_max + value: 13.6273 + - type: nauc_precision_at_1000_std + value: 7.0956 + - type: nauc_precision_at_1000_diff1 + value: -3.6863 + - type: nauc_mrr_at_1_max + value: 34.6903 + - type: nauc_mrr_at_1_std + value: -0.48469999999999996 + - type: nauc_mrr_at_1_diff1 + value: 41.827799999999996 + - type: nauc_mrr_at_3_max + value: 35.826 + - type: nauc_mrr_at_3_std + value: 1.3141999999999998 + - type: nauc_mrr_at_3_diff1 + value: 37.1995 + - type: nauc_mrr_at_5_max + value: 35.6178 + - type: nauc_mrr_at_5_std + value: 1.3211 + - type: nauc_mrr_at_5_diff1 + value: 36.8396 + - type: nauc_mrr_at_10_max + value: 35.4784 + - type: nauc_mrr_at_10_std + value: 1.6153 + - type: nauc_mrr_at_10_diff1 + value: 
36.6262 + - type: nauc_mrr_at_20_max + value: 35.5478 + - type: nauc_mrr_at_20_std + value: 1.8614 + - type: nauc_mrr_at_20_diff1 + value: 36.5754 + - type: nauc_mrr_at_100_max + value: 35.5825 + - type: nauc_mrr_at_100_std + value: 1.9792 + - type: nauc_mrr_at_100_diff1 + value: 36.5758 + - type: nauc_mrr_at_1000_max + value: 35.5811 + - type: nauc_mrr_at_1000_std + value: 1.9691 + - type: nauc_mrr_at_1000_diff1 + value: 36.587399999999995 + - type: main_score + value: 33.937 + task: + type: Retrieval + - dataset: + config: default + name: MTEB CQADupstackUnixRetrieval (default) + revision: 6c6430d3a6d36f8d2a829195bc5dc94d7e063e53 + split: test + type: mteb/cqadupstack-unix + metrics: + - type: ndcg_at_1 + value: 36.381 + - type: ndcg_at_3 + value: 41.605 + - type: ndcg_at_5 + value: 43.854 + - type: ndcg_at_10 + value: 46.831 + - type: ndcg_at_20 + value: 49.114999999999995 + - type: ndcg_at_100 + value: 52.071 + - type: ndcg_at_1000 + value: 53.864999999999995 + - type: map_at_1 + value: 30.957 + - type: map_at_3 + value: 38.074999999999996 + - type: map_at_5 + value: 39.732 + - type: map_at_10 + value: 41.187000000000005 + - type: map_at_20 + value: 41.94 + - type: map_at_100 + value: 42.447 + - type: map_at_1000 + value: 42.536 + - type: recall_at_1 + value: 30.957 + - type: recall_at_3 + value: 45.213 + - type: recall_at_5 + value: 51.196 + - type: recall_at_10 + value: 59.724 + - type: recall_at_20 + value: 67.837 + - type: recall_at_100 + value: 81.843 + - type: recall_at_1000 + value: 93.91000000000001 + - type: precision_at_1 + value: 36.381 + - type: precision_at_3 + value: 18.999 + - type: precision_at_5 + value: 13.172 + - type: precision_at_10 + value: 7.938000000000001 + - type: precision_at_20 + value: 4.6129999999999995 + - type: precision_at_100 + value: 1.172 + - type: precision_at_1000 + value: 0.14300000000000002 + - type: mrr_at_1 + value: 36.3806 + - type: mrr_at_3 + value: 42.7239 + - type: mrr_at_5 + value: 44.0905 + - type: mrr_at_10 + 
value: 45.2951 + - type: mrr_at_20 + value: 45.8788 + - type: mrr_at_100 + value: 46.1807 + - type: mrr_at_1000 + value: 46.226800000000004 + - type: nauc_ndcg_at_1_max + value: 47.0214 + - type: nauc_ndcg_at_1_std + value: -0.8086 + - type: nauc_ndcg_at_1_diff1 + value: 55.931200000000004 + - type: nauc_ndcg_at_3_max + value: 44.829299999999996 + - type: nauc_ndcg_at_3_std + value: 0.6224000000000001 + - type: nauc_ndcg_at_3_diff1 + value: 49.7765 + - type: nauc_ndcg_at_5_max + value: 44.3325 + - type: nauc_ndcg_at_5_std + value: 0.1854 + - type: nauc_ndcg_at_5_diff1 + value: 49.0426 + - type: nauc_ndcg_at_10_max + value: 44.358599999999996 + - type: nauc_ndcg_at_10_std + value: 0.6905 + - type: nauc_ndcg_at_10_diff1 + value: 48.1902 + - type: nauc_ndcg_at_20_max + value: 45.018 + - type: nauc_ndcg_at_20_std + value: 1.555 + - type: nauc_ndcg_at_20_diff1 + value: 48.2645 + - type: nauc_ndcg_at_100_max + value: 45.3244 + - type: nauc_ndcg_at_100_std + value: 3.0655 + - type: nauc_ndcg_at_100_diff1 + value: 48.1011 + - type: nauc_ndcg_at_1000_max + value: 45.2297 + - type: nauc_ndcg_at_1000_std + value: 2.5452 + - type: nauc_ndcg_at_1000_diff1 + value: 48.4179 + - type: nauc_map_at_1_max + value: 44.1846 + - type: nauc_map_at_1_std + value: -2.661 + - type: nauc_map_at_1_diff1 + value: 58.4395 + - type: nauc_map_at_3_max + value: 44.7697 + - type: nauc_map_at_3_std + value: -0.3776 + - type: nauc_map_at_3_diff1 + value: 52.7119 + - type: nauc_map_at_5_max + value: 44.6708 + - type: nauc_map_at_5_std + value: -0.4622 + - type: nauc_map_at_5_diff1 + value: 51.8622 + - type: nauc_map_at_10_max + value: 44.7631 + - type: nauc_map_at_10_std + value: -0.2403 + - type: nauc_map_at_10_diff1 + value: 51.439299999999996 + - type: nauc_map_at_20_max + value: 45.0612 + - type: nauc_map_at_20_std + value: 0.0038000000000000004 + - type: nauc_map_at_20_diff1 + value: 51.3768 + - type: nauc_map_at_100_max + value: 45.137 + - type: nauc_map_at_100_std + value: 0.2717 + - type: 
nauc_map_at_100_diff1 + value: 51.316700000000004 + - type: nauc_map_at_1000_max + value: 45.1229 + - type: nauc_map_at_1000_std + value: 0.2513 + - type: nauc_map_at_1000_diff1 + value: 51.3133 + - type: nauc_recall_at_1_max + value: 44.1846 + - type: nauc_recall_at_1_std + value: -2.661 + - type: nauc_recall_at_1_diff1 + value: 58.4395 + - type: nauc_recall_at_3_max + value: 41.656 + - type: nauc_recall_at_3_std + value: 1.6587999999999998 + - type: nauc_recall_at_3_diff1 + value: 44.9322 + - type: nauc_recall_at_5_max + value: 40.501 + - type: nauc_recall_at_5_std + value: 1.1215 + - type: nauc_recall_at_5_diff1 + value: 41.7702 + - type: nauc_recall_at_10_max + value: 39.577400000000004 + - type: nauc_recall_at_10_std + value: 2.172 + - type: nauc_recall_at_10_diff1 + value: 38.0253 + - type: nauc_recall_at_20_max + value: 41.1537 + - type: nauc_recall_at_20_std + value: 6.1195 + - type: nauc_recall_at_20_diff1 + value: 37.391400000000004 + - type: nauc_recall_at_100_max + value: 42.2577 + - type: nauc_recall_at_100_std + value: 20.7745 + - type: nauc_recall_at_100_diff1 + value: 32.8151 + - type: nauc_recall_at_1000_max + value: 43.5594 + - type: nauc_recall_at_1000_std + value: 37.6573 + - type: nauc_recall_at_1000_diff1 + value: 29.7545 + - type: nauc_precision_at_1_max + value: 47.0214 + - type: nauc_precision_at_1_std + value: -0.8086 + - type: nauc_precision_at_1_diff1 + value: 55.931200000000004 + - type: nauc_precision_at_3_max + value: 39.4995 + - type: nauc_precision_at_3_std + value: 5.0051 + - type: nauc_precision_at_3_diff1 + value: 32.0456 + - type: nauc_precision_at_5_max + value: 34.972500000000004 + - type: nauc_precision_at_5_std + value: 5.1238 + - type: nauc_precision_at_5_diff1 + value: 24.2515 + - type: nauc_precision_at_10_max + value: 28.364099999999997 + - type: nauc_precision_at_10_std + value: 6.0539000000000005 + - type: nauc_precision_at_10_diff1 + value: 14.192599999999999 + - type: nauc_precision_at_20_max + value: 25.7353 + - 
type: nauc_precision_at_20_std + value: 8.860999999999999 + - type: nauc_precision_at_20_diff1 + value: 7.0925 + - type: nauc_precision_at_100_max + value: 11.8965 + - type: nauc_precision_at_100_std + value: 13.143099999999999 + - type: nauc_precision_at_100_diff1 + value: -8.5811 + - type: nauc_precision_at_1000_max + value: -3.7232000000000003 + - type: nauc_precision_at_1000_std + value: 6.392 + - type: nauc_precision_at_1000_diff1 + value: -20.5151 + - type: nauc_mrr_at_1_max + value: 47.0214 + - type: nauc_mrr_at_1_std + value: -0.8086 + - type: nauc_mrr_at_1_diff1 + value: 55.931200000000004 + - type: nauc_mrr_at_3_max + value: 45.6591 + - type: nauc_mrr_at_3_std + value: 0.6383 + - type: nauc_mrr_at_3_diff1 + value: 50.0407 + - type: nauc_mrr_at_5_max + value: 45.7236 + - type: nauc_mrr_at_5_std + value: 0.5502 + - type: nauc_mrr_at_5_diff1 + value: 49.6432 + - type: nauc_mrr_at_10_max + value: 45.6287 + - type: nauc_mrr_at_10_std + value: 0.6239 + - type: nauc_mrr_at_10_diff1 + value: 49.391200000000005 + - type: nauc_mrr_at_20_max + value: 45.704899999999995 + - type: nauc_mrr_at_20_std + value: 0.7987 + - type: nauc_mrr_at_20_diff1 + value: 49.4844 + - type: nauc_mrr_at_100_max + value: 45.708 + - type: nauc_mrr_at_100_std + value: 0.8823 + - type: nauc_mrr_at_100_diff1 + value: 49.5323 + - type: nauc_mrr_at_1000_max + value: 45.7135 + - type: nauc_mrr_at_1000_std + value: 0.8635999999999999 + - type: nauc_mrr_at_1000_diff1 + value: 49.5497 + - type: main_score + value: 46.831 + task: + type: Retrieval + - dataset: + config: default + name: MTEB CQADupstackWebmastersRetrieval (default) + revision: 160c094312a0e1facb97e55eeddb698c0abe3571 + split: test + type: mteb/cqadupstack-webmasters + metrics: + - type: ndcg_at_1 + value: 34.98 + - type: ndcg_at_3 + value: 39.911 + - type: ndcg_at_5 + value: 42.21 + - type: ndcg_at_10 + value: 45.539 + - type: ndcg_at_20 + value: 47.964 + - type: ndcg_at_100 + value: 51.642999999999994 + - type: ndcg_at_1000 + value: 
53.647 + - type: map_at_1 + value: 30.034 + - type: map_at_3 + value: 35.97 + - type: map_at_5 + value: 37.635999999999996 + - type: map_at_10 + value: 39.367999999999995 + - type: map_at_20 + value: 40.328 + - type: map_at_100 + value: 41.158 + - type: map_at_1000 + value: 41.366 + - type: recall_at_1 + value: 30.034 + - type: recall_at_3 + value: 42.006 + - type: recall_at_5 + value: 47.843 + - type: recall_at_10 + value: 57.568 + - type: recall_at_20 + value: 66.493 + - type: recall_at_100 + value: 84.136 + - type: recall_at_1000 + value: 95.631 + - type: precision_at_1 + value: 34.98 + - type: precision_at_3 + value: 18.116 + - type: precision_at_5 + value: 13.202 + - type: precision_at_10 + value: 8.616999999999999 + - type: precision_at_20 + value: 5.425 + - type: precision_at_100 + value: 1.6260000000000001 + - type: precision_at_1000 + value: 0.249 + - type: mrr_at_1 + value: 34.9802 + - type: mrr_at_3 + value: 41.172599999999996 + - type: mrr_at_5 + value: 42.4671 + - type: mrr_at_10 + value: 43.8709 + - type: mrr_at_20 + value: 44.4684 + - type: mrr_at_100 + value: 44.8617 + - type: mrr_at_1000 + value: 44.9033 + - type: nauc_ndcg_at_1_max + value: 36.1514 + - type: nauc_ndcg_at_1_std + value: 6.7383 + - type: nauc_ndcg_at_1_diff1 + value: 49.9936 + - type: nauc_ndcg_at_3_max + value: 38.3225 + - type: nauc_ndcg_at_3_std + value: 8.0985 + - type: nauc_ndcg_at_3_diff1 + value: 42.9416 + - type: nauc_ndcg_at_5_max + value: 39.4299 + - type: nauc_ndcg_at_5_std + value: 9.2335 + - type: nauc_ndcg_at_5_diff1 + value: 43.4214 + - type: nauc_ndcg_at_10_max + value: 39.1123 + - type: nauc_ndcg_at_10_std + value: 9.4134 + - type: nauc_ndcg_at_10_diff1 + value: 42.6415 + - type: nauc_ndcg_at_20_max + value: 38.9531 + - type: nauc_ndcg_at_20_std + value: 9.707 + - type: nauc_ndcg_at_20_diff1 + value: 43.0215 + - type: nauc_ndcg_at_100_max + value: 40.3045 + - type: nauc_ndcg_at_100_std + value: 11.304400000000001 + - type: nauc_ndcg_at_100_diff1 + value: 43.0846 + - 
type: nauc_ndcg_at_1000_max + value: 39.9421 + - type: nauc_ndcg_at_1000_std + value: 11.1666 + - type: nauc_ndcg_at_1000_diff1 + value: 43.3505 + - type: nauc_map_at_1_max + value: 34.735 + - type: nauc_map_at_1_std + value: 2.9007 + - type: nauc_map_at_1_diff1 + value: 52.495599999999996 + - type: nauc_map_at_3_max + value: 37.5749 + - type: nauc_map_at_3_std + value: 5.1779 + - type: nauc_map_at_3_diff1 + value: 46.536300000000004 + - type: nauc_map_at_5_max + value: 38.4721 + - type: nauc_map_at_5_std + value: 6.0973 + - type: nauc_map_at_5_diff1 + value: 46.434799999999996 + - type: nauc_map_at_10_max + value: 38.744299999999996 + - type: nauc_map_at_10_std + value: 6.7116 + - type: nauc_map_at_10_diff1 + value: 46.0759 + - type: nauc_map_at_20_max + value: 38.756 + - type: nauc_map_at_20_std + value: 7.263699999999999 + - type: nauc_map_at_20_diff1 + value: 46.0274 + - type: nauc_map_at_100_max + value: 38.9362 + - type: nauc_map_at_100_std + value: 8.0227 + - type: nauc_map_at_100_diff1 + value: 45.8767 + - type: nauc_map_at_1000_max + value: 38.7473 + - type: nauc_map_at_1000_std + value: 8.089 + - type: nauc_map_at_1000_diff1 + value: 45.8848 + - type: nauc_recall_at_1_max + value: 34.735 + - type: nauc_recall_at_1_std + value: 2.9007 + - type: nauc_recall_at_1_diff1 + value: 52.495599999999996 + - type: nauc_recall_at_3_max + value: 37.1901 + - type: nauc_recall_at_3_std + value: 6.4211 + - type: nauc_recall_at_3_diff1 + value: 38.846000000000004 + - type: nauc_recall_at_5_max + value: 39.8879 + - type: nauc_recall_at_5_std + value: 9.5204 + - type: nauc_recall_at_5_diff1 + value: 37.9339 + - type: nauc_recall_at_10_max + value: 37.181999999999995 + - type: nauc_recall_at_10_std + value: 9.764100000000001 + - type: nauc_recall_at_10_diff1 + value: 33.4855 + - type: nauc_recall_at_20_max + value: 35.6859 + - type: nauc_recall_at_20_std + value: 13.173599999999999 + - type: nauc_recall_at_20_diff1 + value: 33.254 + - type: nauc_recall_at_100_max + value: 
42.728100000000005 + - type: nauc_recall_at_100_std + value: 25.913999999999998 + - type: nauc_recall_at_100_diff1 + value: 28.9205 + - type: nauc_recall_at_1000_max + value: 56.496900000000004 + - type: nauc_recall_at_1000_std + value: 56.183499999999995 + - type: nauc_recall_at_1000_diff1 + value: 24.8659 + - type: nauc_precision_at_1_max + value: 36.1514 + - type: nauc_precision_at_1_std + value: 6.7383 + - type: nauc_precision_at_1_diff1 + value: 49.9936 + - type: nauc_precision_at_3_max + value: 36.5767 + - type: nauc_precision_at_3_std + value: 14.884500000000001 + - type: nauc_precision_at_3_diff1 + value: 26.1181 + - type: nauc_precision_at_5_max + value: 33.7094 + - type: nauc_precision_at_5_std + value: 17.566699999999997 + - type: nauc_precision_at_5_diff1 + value: 20.061799999999998 + - type: nauc_precision_at_10_max + value: 28.034 + - type: nauc_precision_at_10_std + value: 23.1877 + - type: nauc_precision_at_10_diff1 + value: 9.646799999999999 + - type: nauc_precision_at_20_max + value: 17.930699999999998 + - type: nauc_precision_at_20_std + value: 23.0956 + - type: nauc_precision_at_20_diff1 + value: -0.0383 + - type: nauc_precision_at_100_max + value: 0.6149 + - type: nauc_precision_at_100_std + value: 22.7163 + - type: nauc_precision_at_100_diff1 + value: -8.730400000000001 + - type: nauc_precision_at_1000_max + value: -19.8022 + - type: nauc_precision_at_1000_std + value: 8.6017 + - type: nauc_precision_at_1000_diff1 + value: -14.161499999999998 + - type: nauc_mrr_at_1_max + value: 36.1514 + - type: nauc_mrr_at_1_std + value: 6.7383 + - type: nauc_mrr_at_1_diff1 + value: 49.9936 + - type: nauc_mrr_at_3_max + value: 37.894299999999994 + - type: nauc_mrr_at_3_std + value: 8.948599999999999 + - type: nauc_mrr_at_3_diff1 + value: 43.985400000000006 + - type: nauc_mrr_at_5_max + value: 38.8686 + - type: nauc_mrr_at_5_std + value: 9.4464 + - type: nauc_mrr_at_5_diff1 + value: 43.9985 + - type: nauc_mrr_at_10_max + value: 38.419 + - type: 
nauc_mrr_at_10_std + value: 9.4221 + - type: nauc_mrr_at_10_diff1 + value: 43.621700000000004 + - type: nauc_mrr_at_20_max + value: 38.3933 + - type: nauc_mrr_at_20_std + value: 9.6024 + - type: nauc_mrr_at_20_diff1 + value: 43.8952 + - type: nauc_mrr_at_100_max + value: 38.4371 + - type: nauc_mrr_at_100_std + value: 9.657200000000001 + - type: nauc_mrr_at_100_diff1 + value: 43.9457 + - type: nauc_mrr_at_1000_max + value: 38.4386 + - type: nauc_mrr_at_1000_std + value: 9.6614 + - type: nauc_mrr_at_1000_diff1 + value: 43.9579 + - type: main_score + value: 45.539 + task: + type: Retrieval + - dataset: + config: default + name: MTEB CQADupstackWordpressRetrieval (default) + revision: 4ffe81d471b1924886b33c7567bfb200e9eec5c4 + split: test + type: mteb/cqadupstack-wordpress + metrics: + - type: ndcg_at_1 + value: 26.987 + - type: ndcg_at_3 + value: 33.056999999999995 + - type: ndcg_at_5 + value: 35.356 + - type: ndcg_at_10 + value: 38.440000000000005 + - type: ndcg_at_20 + value: 40.136 + - type: ndcg_at_100 + value: 43.473 + - type: ndcg_at_1000 + value: 45.687 + - type: map_at_1 + value: 24.651999999999997 + - type: map_at_3 + value: 30.416999999999998 + - type: map_at_5 + value: 31.863999999999997 + - type: map_at_10 + value: 33.253 + - type: map_at_20 + value: 33.756 + - type: map_at_100 + value: 34.257 + - type: map_at_1000 + value: 34.347 + - type: recall_at_1 + value: 24.651999999999997 + - type: recall_at_3 + value: 37.88 + - type: recall_at_5 + value: 43.136 + - type: recall_at_10 + value: 52.06699999999999 + - type: recall_at_20 + value: 58.540000000000006 + - type: recall_at_100 + value: 75.22 + - type: recall_at_1000 + value: 91.774 + - type: precision_at_1 + value: 26.987 + - type: precision_at_3 + value: 14.048 + - type: precision_at_5 + value: 9.871 + - type: precision_at_10 + value: 6.063000000000001 + - type: precision_at_20 + value: 3.4099999999999997 + - type: precision_at_100 + value: 0.922 + - type: precision_at_1000 + value: 0.123 + - type: 
mrr_at_1 + value: 26.9871 + - type: mrr_at_3 + value: 33.1485 + - type: mrr_at_5 + value: 34.3407 + - type: mrr_at_10 + value: 35.6087 + - type: mrr_at_20 + value: 36.0483 + - type: mrr_at_100 + value: 36.463699999999996 + - type: mrr_at_1000 + value: 36.5278 + - type: nauc_ndcg_at_1_max + value: 26.6537 + - type: nauc_ndcg_at_1_std + value: -3.9813 + - type: nauc_ndcg_at_1_diff1 + value: 47.8302 + - type: nauc_ndcg_at_3_max + value: 27.3661 + - type: nauc_ndcg_at_3_std + value: -2.2132 + - type: nauc_ndcg_at_3_diff1 + value: 39.9424 + - type: nauc_ndcg_at_5_max + value: 27.417799999999996 + - type: nauc_ndcg_at_5_std + value: -1.0684 + - type: nauc_ndcg_at_5_diff1 + value: 39.163599999999995 + - type: nauc_ndcg_at_10_max + value: 26.555400000000002 + - type: nauc_ndcg_at_10_std + value: 0.0103 + - type: nauc_ndcg_at_10_diff1 + value: 38.9487 + - type: nauc_ndcg_at_20_max + value: 25.963900000000002 + - type: nauc_ndcg_at_20_std + value: 0.7779 + - type: nauc_ndcg_at_20_diff1 + value: 38.7279 + - type: nauc_ndcg_at_100_max + value: 26.6365 + - type: nauc_ndcg_at_100_std + value: 3.0018 + - type: nauc_ndcg_at_100_diff1 + value: 38.1326 + - type: nauc_ndcg_at_1000_max + value: 26.52 + - type: nauc_ndcg_at_1000_std + value: 2.6968 + - type: nauc_ndcg_at_1000_diff1 + value: 38.1665 + - type: nauc_map_at_1_max + value: 24.950400000000002 + - type: nauc_map_at_1_std + value: -4.2715000000000005 + - type: nauc_map_at_1_diff1 + value: 48.2994 + - type: nauc_map_at_3_max + value: 26.4208 + - type: nauc_map_at_3_std + value: -3.0675 + - type: nauc_map_at_3_diff1 + value: 41.987 + - type: nauc_map_at_5_max + value: 26.641900000000003 + - type: nauc_map_at_5_std + value: -2.3005 + - type: nauc_map_at_5_diff1 + value: 41.4695 + - type: nauc_map_at_10_max + value: 26.2781 + - type: nauc_map_at_10_std + value: -1.8994 + - type: nauc_map_at_10_diff1 + value: 41.193000000000005 + - type: nauc_map_at_20_max + value: 26.0838 + - type: nauc_map_at_20_std + value: -1.7046999999999999 + 
- type: nauc_map_at_20_diff1 + value: 41.1128 + - type: nauc_map_at_100_max + value: 26.230199999999996 + - type: nauc_map_at_100_std + value: -1.2565 + - type: nauc_map_at_100_diff1 + value: 41.0271 + - type: nauc_map_at_1000_max + value: 26.2069 + - type: nauc_map_at_1000_std + value: -1.2469 + - type: nauc_map_at_1000_diff1 + value: 41.019 + - type: nauc_recall_at_1_max + value: 24.950400000000002 + - type: nauc_recall_at_1_std + value: -4.2715000000000005 + - type: nauc_recall_at_1_diff1 + value: 48.2994 + - type: nauc_recall_at_3_max + value: 27.2098 + - type: nauc_recall_at_3_std + value: -1.309 + - type: nauc_recall_at_3_diff1 + value: 34.4663 + - type: nauc_recall_at_5_max + value: 27.323700000000002 + - type: nauc_recall_at_5_std + value: 1.7010999999999998 + - type: nauc_recall_at_5_diff1 + value: 32.4911 + - type: nauc_recall_at_10_max + value: 24.6483 + - type: nauc_recall_at_10_std + value: 4.9019 + - type: nauc_recall_at_10_diff1 + value: 32.0585 + - type: nauc_recall_at_20_max + value: 22.556 + - type: nauc_recall_at_20_std + value: 8.1527 + - type: nauc_recall_at_20_diff1 + value: 30.8345 + - type: nauc_recall_at_100_max + value: 25.354300000000002 + - type: nauc_recall_at_100_std + value: 22.8578 + - type: nauc_recall_at_100_diff1 + value: 23.291999999999998 + - type: nauc_recall_at_1000_max + value: 26.523999999999997 + - type: nauc_recall_at_1000_std + value: 44.7733 + - type: nauc_recall_at_1000_diff1 + value: 3.1338 + - type: nauc_precision_at_1_max + value: 26.6537 + - type: nauc_precision_at_1_std + value: -3.9813 + - type: nauc_precision_at_1_diff1 + value: 47.8302 + - type: nauc_precision_at_3_max + value: 30.8201 + - type: nauc_precision_at_3_std + value: 1.7691 + - type: nauc_precision_at_3_diff1 + value: 33.3835 + - type: nauc_precision_at_5_max + value: 29.5433 + - type: nauc_precision_at_5_std + value: 4.4224 + - type: nauc_precision_at_5_diff1 + value: 28.426000000000002 + - type: nauc_precision_at_10_max + value: 26.0888 + - type: 
nauc_precision_at_10_std + value: 7.8104000000000005 + - type: nauc_precision_at_10_diff1 + value: 24.509800000000002 + - type: nauc_precision_at_20_max + value: 22.218799999999998 + - type: nauc_precision_at_20_std + value: 11.248099999999999 + - type: nauc_precision_at_20_diff1 + value: 20.6056 + - type: nauc_precision_at_100_max + value: 16.4622 + - type: nauc_precision_at_100_std + value: 25.735200000000003 + - type: nauc_precision_at_100_diff1 + value: 6.2566 + - type: nauc_precision_at_1000_max + value: -9.109399999999999 + - type: nauc_precision_at_1000_std + value: 13.820099999999998 + - type: nauc_precision_at_1000_diff1 + value: -7.9046 + - type: nauc_mrr_at_1_max + value: 26.6537 + - type: nauc_mrr_at_1_std + value: -3.9813 + - type: nauc_mrr_at_1_diff1 + value: 47.8302 + - type: nauc_mrr_at_3_max + value: 27.9843 + - type: nauc_mrr_at_3_std + value: -2.3418 + - type: nauc_mrr_at_3_diff1 + value: 41.4877 + - type: nauc_mrr_at_5_max + value: 27.9298 + - type: nauc_mrr_at_5_std + value: -1.7860999999999998 + - type: nauc_mrr_at_5_diff1 + value: 40.9261 + - type: nauc_mrr_at_10_max + value: 27.6814 + - type: nauc_mrr_at_10_std + value: -1.1542000000000001 + - type: nauc_mrr_at_10_diff1 + value: 40.9534 + - type: nauc_mrr_at_20_max + value: 27.507900000000003 + - type: nauc_mrr_at_20_std + value: -0.9558000000000001 + - type: nauc_mrr_at_20_diff1 + value: 41.0046 + - type: nauc_mrr_at_100_max + value: 27.5032 + - type: nauc_mrr_at_100_std + value: -0.7483 + - type: nauc_mrr_at_100_diff1 + value: 40.9239 + - type: nauc_mrr_at_1000_max + value: 27.4957 + - type: nauc_mrr_at_1000_std + value: -0.7642 + - type: nauc_mrr_at_1000_diff1 + value: 40.9219 + - type: main_score + value: 38.440000000000005 + task: + type: Retrieval + - dataset: + config: default + name: MTEB ClimateFEVER (default) + revision: 47f2ac6acb640fc46020b02a5b59fdda04d39380 + split: test + type: mteb/climate-fever + metrics: + - type: ndcg_at_1 + value: 47.231 + - type: ndcg_at_3 + value: 
38.605000000000004 + - type: ndcg_at_5 + value: 40.058 + - type: ndcg_at_10 + value: 43.482 + - type: ndcg_at_20 + value: 45.732 + - type: ndcg_at_100 + value: 49.062 + - type: ndcg_at_1000 + value: 51.605000000000004 + - type: map_at_1 + value: 20.674 + - type: map_at_3 + value: 29.375 + - type: map_at_5 + value: 31.872 + - type: map_at_10 + value: 33.846 + - type: map_at_20 + value: 34.733000000000004 + - type: map_at_100 + value: 35.411 + - type: map_at_1000 + value: 35.553000000000004 + - type: recall_at_1 + value: 20.674 + - type: recall_at_3 + value: 33.859 + - type: recall_at_5 + value: 39.76 + - type: recall_at_10 + value: 47.150999999999996 + - type: recall_at_20 + value: 53.522999999999996 + - type: recall_at_100 + value: 66.125 + - type: recall_at_1000 + value: 80.368 + - type: precision_at_1 + value: 47.231 + - type: precision_at_3 + value: 28.534 + - type: precision_at_5 + value: 20.782 + - type: precision_at_10 + value: 12.742999999999999 + - type: precision_at_20 + value: 7.342 + - type: precision_at_100 + value: 1.883 + - type: precision_at_1000 + value: 0.23700000000000002 + - type: mrr_at_1 + value: 47.2313 + - type: mrr_at_3 + value: 55.6352 + - type: mrr_at_5 + value: 56.92509999999999 + - type: mrr_at_10 + value: 57.833400000000005 + - type: mrr_at_20 + value: 58.178700000000006 + - type: mrr_at_100 + value: 58.385 + - type: mrr_at_1000 + value: 58.40919999999999 + - type: nauc_ndcg_at_1_max + value: 41.5456 + - type: nauc_ndcg_at_1_std + value: 19.2734 + - type: nauc_ndcg_at_1_diff1 + value: 38.0868 + - type: nauc_ndcg_at_3_max + value: 41.6105 + - type: nauc_ndcg_at_3_std + value: 19.5917 + - type: nauc_ndcg_at_3_diff1 + value: 29.192800000000002 + - type: nauc_ndcg_at_5_max + value: 42.1893 + - type: nauc_ndcg_at_5_std + value: 21.9984 + - type: nauc_ndcg_at_5_diff1 + value: 27.7412 + - type: nauc_ndcg_at_10_max + value: 42.5633 + - type: nauc_ndcg_at_10_std + value: 24.265700000000002 + - type: nauc_ndcg_at_10_diff1 + value: 27.0287 + - 
type: nauc_ndcg_at_20_max + value: 43.364200000000004 + - type: nauc_ndcg_at_20_std + value: 26.2174 + - type: nauc_ndcg_at_20_diff1 + value: 26.980500000000003 + - type: nauc_ndcg_at_100_max + value: 43.9582 + - type: nauc_ndcg_at_100_std + value: 28.454 + - type: nauc_ndcg_at_100_diff1 + value: 27.087099999999996 + - type: nauc_ndcg_at_1000_max + value: 44.0356 + - type: nauc_ndcg_at_1000_std + value: 28.64 + - type: nauc_ndcg_at_1000_diff1 + value: 27.1343 + - type: nauc_map_at_1_max + value: 39.2181 + - type: nauc_map_at_1_std + value: 12.4972 + - type: nauc_map_at_1_diff1 + value: 39.5664 + - type: nauc_map_at_3_max + value: 41.5441 + - type: nauc_map_at_3_std + value: 17.333000000000002 + - type: nauc_map_at_3_diff1 + value: 29.9555 + - type: nauc_map_at_5_max + value: 41.0041 + - type: nauc_map_at_5_std + value: 19.3667 + - type: nauc_map_at_5_diff1 + value: 28.0157 + - type: nauc_map_at_10_max + value: 41.2914 + - type: nauc_map_at_10_std + value: 21.051000000000002 + - type: nauc_map_at_10_diff1 + value: 27.387 + - type: nauc_map_at_20_max + value: 41.6964 + - type: nauc_map_at_20_std + value: 21.9338 + - type: nauc_map_at_20_diff1 + value: 27.4326 + - type: nauc_map_at_100_max + value: 41.8592 + - type: nauc_map_at_100_std + value: 22.46 + - type: nauc_map_at_100_diff1 + value: 27.4024 + - type: nauc_map_at_1000_max + value: 41.8737 + - type: nauc_map_at_1000_std + value: 22.4882 + - type: nauc_map_at_1000_diff1 + value: 27.405099999999997 + - type: nauc_recall_at_1_max + value: 39.2181 + - type: nauc_recall_at_1_std + value: 12.4972 + - type: nauc_recall_at_1_diff1 + value: 39.5664 + - type: nauc_recall_at_3_max + value: 41.3571 + - type: nauc_recall_at_3_std + value: 18.607699999999998 + - type: nauc_recall_at_3_diff1 + value: 25.8418 + - type: nauc_recall_at_5_max + value: 39.1225 + - type: nauc_recall_at_5_std + value: 22.2091 + - type: nauc_recall_at_5_diff1 + value: 20.9495 + - type: nauc_recall_at_10_max + value: 38.0045 + - type: 
nauc_recall_at_10_std + value: 25.584 + - type: nauc_recall_at_10_diff1 + value: 18.489 + - type: nauc_recall_at_20_max + value: 38.0096 + - type: nauc_recall_at_20_std + value: 29.3335 + - type: nauc_recall_at_20_diff1 + value: 17.0106 + - type: nauc_recall_at_100_max + value: 37.7378 + - type: nauc_recall_at_100_std + value: 37.0189 + - type: nauc_recall_at_100_diff1 + value: 14.815900000000001 + - type: nauc_recall_at_1000_max + value: 36.2825 + - type: nauc_recall_at_1000_std + value: 42.1995 + - type: nauc_recall_at_1000_diff1 + value: 10.5182 + - type: nauc_precision_at_1_max + value: 41.5456 + - type: nauc_precision_at_1_std + value: 19.2734 + - type: nauc_precision_at_1_diff1 + value: 38.0868 + - type: nauc_precision_at_3_max + value: 35.72 + - type: nauc_precision_at_3_std + value: 22.8785 + - type: nauc_precision_at_3_diff1 + value: 15.240200000000002 + - type: nauc_precision_at_5_max + value: 30.4643 + - type: nauc_precision_at_5_std + value: 26.2774 + - type: nauc_precision_at_5_diff1 + value: 8.8749 + - type: nauc_precision_at_10_max + value: 25.960299999999997 + - type: nauc_precision_at_10_std + value: 28.3825 + - type: nauc_precision_at_10_diff1 + value: 4.626799999999999 + - type: nauc_precision_at_20_max + value: 24.8278 + - type: nauc_precision_at_20_std + value: 32.1644 + - type: nauc_precision_at_20_diff1 + value: 2.5019 + - type: nauc_precision_at_100_max + value: 17.180999999999997 + - type: nauc_precision_at_100_std + value: 33.955400000000004 + - type: nauc_precision_at_100_diff1 + value: -1.9183 + - type: nauc_precision_at_1000_max + value: 4.8986 + - type: nauc_precision_at_1000_std + value: 26.5376 + - type: nauc_precision_at_1000_diff1 + value: -9.3468 + - type: nauc_mrr_at_1_max + value: 41.5456 + - type: nauc_mrr_at_1_std + value: 19.2734 + - type: nauc_mrr_at_1_diff1 + value: 38.0868 + - type: nauc_mrr_at_3_max + value: 43.7301 + - type: nauc_mrr_at_3_std + value: 22.409100000000002 + - type: nauc_mrr_at_3_diff1 + value: 
34.846500000000006 + - type: nauc_mrr_at_5_max + value: 44.0608 + - type: nauc_mrr_at_5_std + value: 23.3812 + - type: nauc_mrr_at_5_diff1 + value: 34.5847 + - type: nauc_mrr_at_10_max + value: 44.026700000000005 + - type: nauc_mrr_at_10_std + value: 23.339399999999998 + - type: nauc_mrr_at_10_diff1 + value: 34.7306 + - type: nauc_mrr_at_20_max + value: 44.1444 + - type: nauc_mrr_at_20_std + value: 23.5132 + - type: nauc_mrr_at_20_diff1 + value: 34.6927 + - type: nauc_mrr_at_100_max + value: 44.1228 + - type: nauc_mrr_at_100_std + value: 23.5783 + - type: nauc_mrr_at_100_diff1 + value: 34.7193 + - type: nauc_mrr_at_1000_max + value: 44.1082 + - type: nauc_mrr_at_1000_std + value: 23.5574 + - type: nauc_mrr_at_1000_diff1 + value: 34.719699999999996 + - type: main_score + value: 43.482 + task: + type: Retrieval + - dataset: + config: default + name: MTEB DBPedia (default) + revision: c0f706b76e590d620bd6618b3ca8efdd34e2d659 + split: test + type: mteb/dbpedia + metrics: + - type: ndcg_at_1 + value: 59.25 + - type: ndcg_at_3 + value: 48.256 + - type: ndcg_at_5 + value: 45.580999999999996 + - type: ndcg_at_10 + value: 43.37 + - type: ndcg_at_20 + value: 43.106 + - type: ndcg_at_100 + value: 47.845 + - type: ndcg_at_1000 + value: 54.974999999999994 + - type: map_at_1 + value: 10.032 + - type: map_at_3 + value: 14.954 + - type: map_at_5 + value: 17.408 + - type: map_at_10 + value: 20.461 + - type: map_at_20 + value: 23.759 + - type: map_at_100 + value: 28.718 + - type: map_at_1000 + value: 30.406 + - type: recall_at_1 + value: 10.032 + - type: recall_at_3 + value: 15.905 + - type: recall_at_5 + value: 19.622999999999998 + - type: recall_at_10 + value: 25.125999999999998 + - type: recall_at_20 + value: 33.262 + - type: recall_at_100 + value: 52.515 + - type: recall_at_1000 + value: 75.224 + - type: precision_at_1 + value: 72.0 + - type: precision_at_3 + value: 50.917 + - type: precision_at_5 + value: 43.4 + - type: precision_at_10 + value: 34.175 + - type: precision_at_20 
+ value: 26.325 + - type: precision_at_100 + value: 10.893 + - type: precision_at_1000 + value: 2.0549999999999997 + - type: mrr_at_1 + value: 72.0 + - type: mrr_at_3 + value: 77.5417 + - type: mrr_at_5 + value: 78.2042 + - type: mrr_at_10 + value: 78.7173 + - type: mrr_at_20 + value: 78.9521 + - type: mrr_at_100 + value: 79.0382 + - type: mrr_at_1000 + value: 79.0408 + - type: nauc_ndcg_at_1_max + value: 49.778 + - type: nauc_ndcg_at_1_std + value: 20.462 + - type: nauc_ndcg_at_1_diff1 + value: 49.3621 + - type: nauc_ndcg_at_3_max + value: 44.4388 + - type: nauc_ndcg_at_3_std + value: 24.646 + - type: nauc_ndcg_at_3_diff1 + value: 33.3173 + - type: nauc_ndcg_at_5_max + value: 44.2179 + - type: nauc_ndcg_at_5_std + value: 25.597399999999997 + - type: nauc_ndcg_at_5_diff1 + value: 31.0886 + - type: nauc_ndcg_at_10_max + value: 43.7812 + - type: nauc_ndcg_at_10_std + value: 25.61 + - type: nauc_ndcg_at_10_diff1 + value: 30.667699999999996 + - type: nauc_ndcg_at_20_max + value: 39.4779 + - type: nauc_ndcg_at_20_std + value: 20.891000000000002 + - type: nauc_ndcg_at_20_diff1 + value: 29.492600000000003 + - type: nauc_ndcg_at_100_max + value: 41.511900000000004 + - type: nauc_ndcg_at_100_std + value: 27.340999999999998 + - type: nauc_ndcg_at_100_diff1 + value: 30.5701 + - type: nauc_ndcg_at_1000_max + value: 47.0571 + - type: nauc_ndcg_at_1000_std + value: 37.0976 + - type: nauc_ndcg_at_1000_diff1 + value: 31.5615 + - type: nauc_map_at_1_max + value: 0.4743 + - type: nauc_map_at_1_std + value: -23.7532 + - type: nauc_map_at_1_diff1 + value: 26.0851 + - type: nauc_map_at_3_max + value: 8.5131 + - type: nauc_map_at_3_std + value: -18.6015 + - type: nauc_map_at_3_diff1 + value: 21.9172 + - type: nauc_map_at_5_max + value: 12.295499999999999 + - type: nauc_map_at_5_std + value: -13.872100000000001 + - type: nauc_map_at_5_diff1 + value: 21.3319 + - type: nauc_map_at_10_max + value: 17.1428 + - type: nauc_map_at_10_std + value: -6.638199999999999 + - type: 
nauc_map_at_10_diff1 + value: 20.8671 + - type: nauc_map_at_20_max + value: 21.7306 + - type: nauc_map_at_20_std + value: 2.1404 + - type: nauc_map_at_20_diff1 + value: 20.7929 + - type: nauc_map_at_100_max + value: 29.677799999999998 + - type: nauc_map_at_100_std + value: 16.9458 + - type: nauc_map_at_100_diff1 + value: 22.4101 + - type: nauc_map_at_1000_max + value: 31.5735 + - type: nauc_map_at_1000_std + value: 20.5816 + - type: nauc_map_at_1000_diff1 + value: 22.561400000000003 + - type: nauc_recall_at_1_max + value: 0.4743 + - type: nauc_recall_at_1_std + value: -23.7532 + - type: nauc_recall_at_1_diff1 + value: 26.0851 + - type: nauc_recall_at_3_max + value: 6.851500000000001 + - type: nauc_recall_at_3_std + value: -18.7341 + - type: nauc_recall_at_3_diff1 + value: 19.703699999999998 + - type: nauc_recall_at_5_max + value: 10.0265 + - type: nauc_recall_at_5_std + value: -14.2537 + - type: nauc_recall_at_5_diff1 + value: 18.8765 + - type: nauc_recall_at_10_max + value: 14.1582 + - type: nauc_recall_at_10_std + value: -7.703 + - type: nauc_recall_at_10_diff1 + value: 17.9056 + - type: nauc_recall_at_20_max + value: 15.0343 + - type: nauc_recall_at_20_std + value: -0.9846 + - type: nauc_recall_at_20_diff1 + value: 14.377899999999999 + - type: nauc_recall_at_100_max + value: 27.904600000000002 + - type: nauc_recall_at_100_std + value: 24.6322 + - type: nauc_recall_at_100_diff1 + value: 16.869500000000002 + - type: nauc_recall_at_1000_max + value: 33.7755 + - type: nauc_recall_at_1000_std + value: 42.241800000000005 + - type: nauc_recall_at_1000_diff1 + value: 17.3324 + - type: nauc_precision_at_1_max + value: 62.3459 + - type: nauc_precision_at_1_std + value: 28.3277 + - type: nauc_precision_at_1_diff1 + value: 57.8053 + - type: nauc_precision_at_3_max + value: 45.8296 + - type: nauc_precision_at_3_std + value: 39.8642 + - type: nauc_precision_at_3_diff1 + value: 15.7381 + - type: nauc_precision_at_5_max + value: 45.331900000000005 + - type: 
nauc_precision_at_5_std + value: 45.1279 + - type: nauc_precision_at_5_diff1 + value: 11.473700000000001 + - type: nauc_precision_at_10_max + value: 42.276399999999995 + - type: nauc_precision_at_10_std + value: 50.9538 + - type: nauc_precision_at_10_diff1 + value: 6.708699999999999 + - type: nauc_precision_at_20_max + value: 37.961600000000004 + - type: nauc_precision_at_20_std + value: 52.0611 + - type: nauc_precision_at_20_diff1 + value: 5.9309 + - type: nauc_precision_at_100_max + value: 29.567 + - type: nauc_precision_at_100_std + value: 50.07 + - type: nauc_precision_at_100_diff1 + value: 3.2583 + - type: nauc_precision_at_1000_max + value: 5.5285 + - type: nauc_precision_at_1000_std + value: 20.5813 + - type: nauc_precision_at_1000_diff1 + value: -6.6333 + - type: nauc_mrr_at_1_max + value: 62.3459 + - type: nauc_mrr_at_1_std + value: 28.3277 + - type: nauc_mrr_at_1_diff1 + value: 57.8053 + - type: nauc_mrr_at_3_max + value: 66.5168 + - type: nauc_mrr_at_3_std + value: 37.4446 + - type: nauc_mrr_at_3_diff1 + value: 57.6125 + - type: nauc_mrr_at_5_max + value: 65.8343 + - type: nauc_mrr_at_5_std + value: 36.6396 + - type: nauc_mrr_at_5_diff1 + value: 56.91589999999999 + - type: nauc_mrr_at_10_max + value: 65.73750000000001 + - type: nauc_mrr_at_10_std + value: 36.4067 + - type: nauc_mrr_at_10_diff1 + value: 56.9594 + - type: nauc_mrr_at_20_max + value: 65.6623 + - type: nauc_mrr_at_20_std + value: 36.0989 + - type: nauc_mrr_at_20_diff1 + value: 56.9662 + - type: nauc_mrr_at_100_max + value: 65.6934 + - type: nauc_mrr_at_100_std + value: 36.0911 + - type: nauc_mrr_at_100_diff1 + value: 57.0541 + - type: nauc_mrr_at_1000_max + value: 65.68929999999999 + - type: nauc_mrr_at_1000_std + value: 36.0838 + - type: nauc_mrr_at_1000_diff1 + value: 57.054300000000005 + - type: main_score + value: 43.37 + task: + type: Retrieval + - dataset: + config: default + name: MTEB EmotionClassification (default) + revision: 4f58c6b202a23cf9a4da393831edf4f9183cad37 + split: test + 
type: mteb/emotion + metrics: + - type: accuracy + value: 42.53 + - type: f1 + value: 38.4608 + - type: f1_weighted + value: 44.6927 + - type: main_score + value: 42.53 + task: + type: Classification + - dataset: + config: default + name: MTEB FEVER (default) + revision: bea83ef9e8fb933d90a2f1d5515737465d613e12 + split: test + type: mteb/fever + metrics: + - type: ndcg_at_1 + value: 90.519 + - type: ndcg_at_3 + value: 91.387 + - type: ndcg_at_5 + value: 91.644 + - type: ndcg_at_10 + value: 91.91 + - type: ndcg_at_20 + value: 92.136 + - type: ndcg_at_100 + value: 92.406 + - type: ndcg_at_1000 + value: 92.62599999999999 + - type: map_at_1 + value: 83.994 + - type: map_at_3 + value: 88.885 + - type: map_at_5 + value: 89.185 + - type: map_at_10 + value: 89.36500000000001 + - type: map_at_20 + value: 89.458 + - type: map_at_100 + value: 89.515 + - type: map_at_1000 + value: 89.52799999999999 + - type: recall_at_1 + value: 83.994 + - type: recall_at_3 + value: 93.145 + - type: recall_at_5 + value: 94.016 + - type: recall_at_10 + value: 94.836 + - type: recall_at_20 + value: 95.56700000000001 + - type: recall_at_100 + value: 96.711 + - type: recall_at_1000 + value: 98.027 + - type: precision_at_1 + value: 90.519 + - type: precision_at_3 + value: 33.922999999999995 + - type: precision_at_5 + value: 20.636 + - type: precision_at_10 + value: 10.474 + - type: precision_at_20 + value: 5.316 + - type: precision_at_100 + value: 1.0919999999999999 + - type: precision_at_1000 + value: 0.11299999999999999 + - type: mrr_at_1 + value: 90.5191 + - type: mrr_at_3 + value: 94.37440000000001 + - type: mrr_at_5 + value: 94.4832 + - type: mrr_at_10 + value: 94.5215 + - type: mrr_at_20 + value: 94.5365 + - type: mrr_at_100 + value: 94.5422 + - type: mrr_at_1000 + value: 94.54249999999999 + - type: nauc_ndcg_at_1_max + value: 22.1341 + - type: nauc_ndcg_at_1_std + value: -11.1273 + - type: nauc_ndcg_at_1_diff1 + value: 81.8507 + - type: nauc_ndcg_at_3_max + value: 16.8937 + - type: 
nauc_ndcg_at_3_std + value: -7.1829 + - type: nauc_ndcg_at_3_diff1 + value: 43.892199999999995 + - type: nauc_ndcg_at_5_max + value: 17.9177 + - type: nauc_ndcg_at_5_std + value: -5.2 + - type: nauc_ndcg_at_5_diff1 + value: 41.9608 + - type: nauc_ndcg_at_10_max + value: 17.8222 + - type: nauc_ndcg_at_10_std + value: -3.8736 + - type: nauc_ndcg_at_10_diff1 + value: 41.955 + - type: nauc_ndcg_at_20_max + value: 18.467200000000002 + - type: nauc_ndcg_at_20_std + value: -2.7304 + - type: nauc_ndcg_at_20_diff1 + value: 42.950300000000006 + - type: nauc_ndcg_at_100_max + value: 18.5918 + - type: nauc_ndcg_at_100_std + value: -2.874 + - type: nauc_ndcg_at_100_diff1 + value: 44.182 + - type: nauc_ndcg_at_1000_max + value: 18.9498 + - type: nauc_ndcg_at_1000_std + value: -2.8561 + - type: nauc_ndcg_at_1000_diff1 + value: 45.5587 + - type: nauc_map_at_1_max + value: 14.943600000000002 + - type: nauc_map_at_1_std + value: -6.3744 + - type: nauc_map_at_1_diff1 + value: 51.697700000000005 + - type: nauc_map_at_3_max + value: 15.7558 + - type: nauc_map_at_3_std + value: -5.8517 + - type: nauc_map_at_3_diff1 + value: 41.814 + - type: nauc_map_at_5_max + value: 16.6287 + - type: nauc_map_at_5_std + value: -4.9942 + - type: nauc_map_at_5_diff1 + value: 41.605199999999996 + - type: nauc_map_at_10_max + value: 16.8146 + - type: nauc_map_at_10_std + value: -4.4551 + - type: nauc_map_at_10_diff1 + value: 41.9641 + - type: nauc_map_at_20_max + value: 17.0709 + - type: nauc_map_at_20_std + value: -4.1187000000000005 + - type: nauc_map_at_20_diff1 + value: 42.3292 + - type: nauc_map_at_100_max + value: 17.1076 + - type: nauc_map_at_100_std + value: -4.1089 + - type: nauc_map_at_100_diff1 + value: 42.5101 + - type: nauc_map_at_1000_max + value: 17.1309 + - type: nauc_map_at_1000_std + value: -4.0958000000000006 + - type: nauc_map_at_1000_diff1 + value: 42.5694 + - type: nauc_recall_at_1_max + value: 14.943600000000002 + - type: nauc_recall_at_1_std + value: -6.3744 + - type: 
nauc_recall_at_1_diff1 + value: 51.697700000000005 + - type: nauc_recall_at_3_max + value: 11.8984 + - type: nauc_recall_at_3_std + value: -4.224 + - type: nauc_recall_at_3_diff1 + value: 13.962 + - type: nauc_recall_at_5_max + value: 16.2434 + - type: nauc_recall_at_5_std + value: 1.6707 + - type: nauc_recall_at_5_diff1 + value: 7.788 + - type: nauc_recall_at_10_max + value: 16.4427 + - type: nauc_recall_at_10_std + value: 8.259 + - type: nauc_recall_at_10_diff1 + value: 4.5507 + - type: nauc_recall_at_20_max + value: 19.0546 + - type: nauc_recall_at_20_std + value: 16.7132 + - type: nauc_recall_at_20_diff1 + value: 3.5242000000000004 + - type: nauc_recall_at_100_max + value: 19.6815 + - type: nauc_recall_at_100_std + value: 21.4767 + - type: nauc_recall_at_100_diff1 + value: 1.4785 + - type: nauc_recall_at_1000_max + value: 26.5748 + - type: nauc_recall_at_1000_std + value: 37.026399999999995 + - type: nauc_recall_at_1000_diff1 + value: 1.512 + - type: nauc_precision_at_1_max + value: 22.1341 + - type: nauc_precision_at_1_std + value: -11.1273 + - type: nauc_precision_at_1_diff1 + value: 81.8507 + - type: nauc_precision_at_3_max + value: 13.6152 + - type: nauc_precision_at_3_std + value: -2.4367 + - type: nauc_precision_at_3_diff1 + value: 1.6237000000000001 + - type: nauc_precision_at_5_max + value: 13.977400000000001 + - type: nauc_precision_at_5_std + value: 4.3391 + - type: nauc_precision_at_5_diff1 + value: -6.660000000000001 + - type: nauc_precision_at_10_max + value: 10.4986 + - type: nauc_precision_at_10_std + value: 8.9132 + - type: nauc_precision_at_10_diff1 + value: -7.5682 + - type: nauc_precision_at_20_max + value: 11.0525 + - type: nauc_precision_at_20_std + value: 12.0579 + - type: nauc_precision_at_20_diff1 + value: -5.0471 + - type: nauc_precision_at_100_max + value: 7.1659 + - type: nauc_precision_at_100_std + value: 8.1754 + - type: nauc_precision_at_100_diff1 + value: -2.7885 + - type: nauc_precision_at_1000_max + value: 4.9776 + - type: 
nauc_precision_at_1000_std + value: 5.8301 + - type: nauc_precision_at_1000_diff1 + value: 0.18860000000000002 + - type: nauc_mrr_at_1_max + value: 22.1341 + - type: nauc_mrr_at_1_std + value: -11.1273 + - type: nauc_mrr_at_1_diff1 + value: 81.8507 + - type: nauc_mrr_at_3_max + value: 21.6738 + - type: nauc_mrr_at_3_std + value: -15.7016 + - type: nauc_mrr_at_3_diff1 + value: 81.0757 + - type: nauc_mrr_at_5_max + value: 22.6603 + - type: nauc_mrr_at_5_std + value: -14.7345 + - type: nauc_mrr_at_5_diff1 + value: 81.1092 + - type: nauc_mrr_at_10_max + value: 22.4279 + - type: nauc_mrr_at_10_std + value: -14.5002 + - type: nauc_mrr_at_10_diff1 + value: 81.11080000000001 + - type: nauc_mrr_at_20_max + value: 22.3604 + - type: nauc_mrr_at_20_std + value: -14.3058 + - type: nauc_mrr_at_20_diff1 + value: 81.1563 + - type: nauc_mrr_at_100_max + value: 22.311 + - type: nauc_mrr_at_100_std + value: -14.318100000000001 + - type: nauc_mrr_at_100_diff1 + value: 81.1586 + - type: nauc_mrr_at_1000_max + value: 22.307199999999998 + - type: nauc_mrr_at_1000_std + value: -14.3234 + - type: nauc_mrr_at_1000_diff1 + value: 81.1576 + - type: main_score + value: 91.91 + task: + type: Retrieval + - dataset: + config: default + name: MTEB FiQA2018 (default) + revision: 27a168819829fe9bcd655c2df245fb19452e8e06 + split: test + type: mteb/fiqa + metrics: + - type: ndcg_at_1 + value: 44.753 + - type: ndcg_at_3 + value: 41.555 + - type: ndcg_at_5 + value: 42.809999999999995 + - type: ndcg_at_10 + value: 45.49 + - type: ndcg_at_20 + value: 48.287 + - type: ndcg_at_100 + value: 52.115 + - type: ndcg_at_1000 + value: 54.797 + - type: map_at_1 + value: 22.894000000000002 + - type: map_at_3 + value: 32.786 + - type: map_at_5 + value: 35.495 + - type: map_at_10 + value: 37.635000000000005 + - type: map_at_20 + value: 38.771 + - type: map_at_100 + value: 39.56 + - type: map_at_1000 + value: 39.734 + - type: recall_at_1 + value: 22.894000000000002 + - type: recall_at_3 + value: 37.579 + - type: 
recall_at_5 + value: 44.03 + - type: recall_at_10 + value: 52.61900000000001 + - type: recall_at_20 + value: 61.227 + - type: recall_at_100 + value: 76.88199999999999 + - type: recall_at_1000 + value: 92.534 + - type: precision_at_1 + value: 44.753 + - type: precision_at_3 + value: 27.675 + - type: precision_at_5 + value: 20.556 + - type: precision_at_10 + value: 12.592999999999998 + - type: precision_at_20 + value: 7.507999999999999 + - type: precision_at_100 + value: 1.9369999999999998 + - type: precision_at_1000 + value: 0.242 + - type: mrr_at_1 + value: 44.7531 + - type: mrr_at_3 + value: 50.694399999999995 + - type: mrr_at_5 + value: 51.990700000000004 + - type: mrr_at_10 + value: 52.9925 + - type: mrr_at_20 + value: 53.4612 + - type: mrr_at_100 + value: 53.7889 + - type: mrr_at_1000 + value: 53.8244 + - type: nauc_ndcg_at_1_max + value: 46.679700000000004 + - type: nauc_ndcg_at_1_std + value: -7.8208 + - type: nauc_ndcg_at_1_diff1 + value: 55.9238 + - type: nauc_ndcg_at_3_max + value: 39.761 + - type: nauc_ndcg_at_3_std + value: -7.6645 + - type: nauc_ndcg_at_3_diff1 + value: 43.6641 + - type: nauc_ndcg_at_5_max + value: 37.2506 + - type: nauc_ndcg_at_5_std + value: -7.574300000000001 + - type: nauc_ndcg_at_5_diff1 + value: 41.6025 + - type: nauc_ndcg_at_10_max + value: 38.1464 + - type: nauc_ndcg_at_10_std + value: -6.1288 + - type: nauc_ndcg_at_10_diff1 + value: 42.625 + - type: nauc_ndcg_at_20_max + value: 39.687 + - type: nauc_ndcg_at_20_std + value: -4.6046 + - type: nauc_ndcg_at_20_diff1 + value: 43.2796 + - type: nauc_ndcg_at_100_max + value: 41.4101 + - type: nauc_ndcg_at_100_std + value: -2.1537 + - type: nauc_ndcg_at_100_diff1 + value: 43.980599999999995 + - type: nauc_ndcg_at_1000_max + value: 42.0853 + - type: nauc_ndcg_at_1000_std + value: -2.5 + - type: nauc_ndcg_at_1000_diff1 + value: 44.5636 + - type: nauc_map_at_1_max + value: 21.019299999999998 + - type: nauc_map_at_1_std + value: -10.8832 + - type: nauc_map_at_1_diff1 + value: 45.1685 + - 
type: nauc_map_at_3_max + value: 29.0524 + - type: nauc_map_at_3_std + value: -9.6495 + - type: nauc_map_at_3_diff1 + value: 41.3844 + - type: nauc_map_at_5_max + value: 31.3813 + - type: nauc_map_at_5_std + value: -8.7888 + - type: nauc_map_at_5_diff1 + value: 40.1699 + - type: nauc_map_at_10_max + value: 33.8361 + - type: nauc_map_at_10_std + value: -7.9594 + - type: nauc_map_at_10_diff1 + value: 40.788999999999994 + - type: nauc_map_at_20_max + value: 34.9439 + - type: nauc_map_at_20_std + value: -7.382700000000001 + - type: nauc_map_at_20_diff1 + value: 41.134100000000004 + - type: nauc_map_at_100_max + value: 35.530899999999995 + - type: nauc_map_at_100_std + value: -6.8411 + - type: nauc_map_at_100_diff1 + value: 41.316 + - type: nauc_map_at_1000_max + value: 35.6246 + - type: nauc_map_at_1000_std + value: -6.828399999999999 + - type: nauc_map_at_1000_diff1 + value: 41.3739 + - type: nauc_recall_at_1_max + value: 21.019299999999998 + - type: nauc_recall_at_1_std + value: -10.8832 + - type: nauc_recall_at_1_diff1 + value: 45.1685 + - type: nauc_recall_at_3_max + value: 25.667499999999997 + - type: nauc_recall_at_3_std + value: -9.3695 + - type: nauc_recall_at_3_diff1 + value: 35.0424 + - type: nauc_recall_at_5_max + value: 26.2285 + - type: nauc_recall_at_5_std + value: -7.6552 + - type: nauc_recall_at_5_diff1 + value: 31.7068 + - type: nauc_recall_at_10_max + value: 29.12 + - type: nauc_recall_at_10_std + value: -3.5869 + - type: nauc_recall_at_10_diff1 + value: 31.952599999999997 + - type: nauc_recall_at_20_max + value: 31.5269 + - type: nauc_recall_at_20_std + value: 2.2824 + - type: nauc_recall_at_20_diff1 + value: 31.4747 + - type: nauc_recall_at_100_max + value: 34.533500000000004 + - type: nauc_recall_at_100_std + value: 18.8398 + - type: nauc_recall_at_100_diff1 + value: 29.525000000000002 + - type: nauc_recall_at_1000_max + value: 38.973600000000005 + - type: nauc_recall_at_1000_std + value: 37.9643 + - type: nauc_recall_at_1000_diff1 + value: 
29.247899999999998 + - type: nauc_precision_at_1_max + value: 46.679700000000004 + - type: nauc_precision_at_1_std + value: -7.8208 + - type: nauc_precision_at_1_diff1 + value: 55.9238 + - type: nauc_precision_at_3_max + value: 46.348800000000004 + - type: nauc_precision_at_3_std + value: -2.4303000000000003 + - type: nauc_precision_at_3_diff1 + value: 31.4803 + - type: nauc_precision_at_5_max + value: 45.657 + - type: nauc_precision_at_5_std + value: 0.9887999999999999 + - type: nauc_precision_at_5_diff1 + value: 22.6439 + - type: nauc_precision_at_10_max + value: 48.147099999999995 + - type: nauc_precision_at_10_std + value: 5.313 + - type: nauc_precision_at_10_diff1 + value: 20.7803 + - type: nauc_precision_at_20_max + value: 47.407199999999996 + - type: nauc_precision_at_20_std + value: 8.8254 + - type: nauc_precision_at_20_diff1 + value: 17.7327 + - type: nauc_precision_at_100_max + value: 43.4944 + - type: nauc_precision_at_100_std + value: 14.8423 + - type: nauc_precision_at_100_diff1 + value: 11.7231 + - type: nauc_precision_at_1000_max + value: 36.3175 + - type: nauc_precision_at_1000_std + value: 14.9478 + - type: nauc_precision_at_1000_diff1 + value: 4.9391 + - type: nauc_mrr_at_1_max + value: 46.679700000000004 + - type: nauc_mrr_at_1_std + value: -7.8208 + - type: nauc_mrr_at_1_diff1 + value: 55.9238 + - type: nauc_mrr_at_3_max + value: 48.0241 + - type: nauc_mrr_at_3_std + value: -6.761100000000001 + - type: nauc_mrr_at_3_diff1 + value: 53.5091 + - type: nauc_mrr_at_5_max + value: 48.0965 + - type: nauc_mrr_at_5_std + value: -6.3173 + - type: nauc_mrr_at_5_diff1 + value: 52.9184 + - type: nauc_mrr_at_10_max + value: 48.3523 + - type: nauc_mrr_at_10_std + value: -5.6531 + - type: nauc_mrr_at_10_diff1 + value: 53.209399999999995 + - type: nauc_mrr_at_20_max + value: 48.365700000000004 + - type: nauc_mrr_at_20_std + value: -5.4359 + - type: nauc_mrr_at_20_diff1 + value: 53.16760000000001 + - type: nauc_mrr_at_100_max + value: 48.351699999999994 + - type: 
nauc_mrr_at_100_std + value: -5.3941 + - type: nauc_mrr_at_100_diff1 + value: 53.2419 + - type: nauc_mrr_at_1000_max + value: 48.343399999999995 + - type: nauc_mrr_at_1000_std + value: -5.4193 + - type: nauc_mrr_at_1000_diff1 + value: 53.264500000000005 + - type: main_score + value: 45.49 + task: + type: Retrieval + - dataset: + config: default + name: MTEB HotpotQA (default) + revision: ab518f4d6fcca38d87c25209f94beba119d02014 + split: test + type: mteb/hotpotqa + metrics: + - type: ndcg_at_1 + value: 86.536 + - type: ndcg_at_3 + value: 64.485 + - type: ndcg_at_5 + value: 66.513 + - type: ndcg_at_10 + value: 68.151 + - type: ndcg_at_20 + value: 69.145 + - type: ndcg_at_100 + value: 70.552 + - type: ndcg_at_1000 + value: 71.772 + - type: map_at_1 + value: 43.268 + - type: map_at_3 + value: 56.013999999999996 + - type: map_at_5 + value: 57.69 + - type: map_at_10 + value: 58.709 + - type: map_at_20 + value: 59.122 + - type: map_at_100 + value: 59.418000000000006 + - type: map_at_1000 + value: 59.480999999999995 + - type: recall_at_1 + value: 43.268 + - type: recall_at_3 + value: 58.831999999999994 + - type: recall_at_5 + value: 62.829 + - type: recall_at_10 + value: 66.94099999999999 + - type: recall_at_20 + value: 70.135 + - type: recall_at_100 + value: 76.34 + - type: recall_at_1000 + value: 84.443 + - type: precision_at_1 + value: 86.536 + - type: precision_at_3 + value: 39.221000000000004 + - type: precision_at_5 + value: 25.131999999999998 + - type: precision_at_10 + value: 13.388 + - type: precision_at_20 + value: 7.013999999999999 + - type: precision_at_100 + value: 1.5270000000000001 + - type: precision_at_1000 + value: 0.169 + - type: mrr_at_1 + value: 86.5361 + - type: mrr_at_3 + value: 89.6151 + - type: mrr_at_5 + value: 89.9521 + - type: mrr_at_10 + value: 90.1301 + - type: mrr_at_20 + value: 90.201 + - type: mrr_at_100 + value: 90.2397 + - type: mrr_at_1000 + value: 90.245 + - type: nauc_ndcg_at_1_max + value: 57.6156 + - type: nauc_ndcg_at_1_std + 
value: -3.39 + - type: nauc_ndcg_at_1_diff1 + value: 83.0288 + - type: nauc_ndcg_at_3_max + value: 17.758599999999998 + - type: nauc_ndcg_at_3_std + value: 3.3521 + - type: nauc_ndcg_at_3_diff1 + value: 15.4846 + - type: nauc_ndcg_at_5_max + value: 14.6571 + - type: nauc_ndcg_at_5_std + value: 4.2071 + - type: nauc_ndcg_at_5_diff1 + value: 12.3942 + - type: nauc_ndcg_at_10_max + value: 12.5579 + - type: nauc_ndcg_at_10_std + value: 4.7895 + - type: nauc_ndcg_at_10_diff1 + value: 10.2189 + - type: nauc_ndcg_at_20_max + value: 11.5413 + - type: nauc_ndcg_at_20_std + value: 5.0043 + - type: nauc_ndcg_at_20_diff1 + value: 9.3896 + - type: nauc_ndcg_at_100_max + value: 10.6797 + - type: nauc_ndcg_at_100_std + value: 5.7805 + - type: nauc_ndcg_at_100_diff1 + value: 8.5649 + - type: nauc_ndcg_at_1000_max + value: 10.8847 + - type: nauc_ndcg_at_1000_std + value: 6.1945 + - type: nauc_ndcg_at_1000_diff1 + value: 8.539 + - type: nauc_map_at_1_max + value: 57.6156 + - type: nauc_map_at_1_std + value: -3.39 + - type: nauc_map_at_1_diff1 + value: 83.0288 + - type: nauc_map_at_3_max + value: 12.4083 + - type: nauc_map_at_3_std + value: 3.2297 + - type: nauc_map_at_3_diff1 + value: 8.2482 + - type: nauc_map_at_5_max + value: 10.4054 + - type: nauc_map_at_5_std + value: 3.7108000000000003 + - type: nauc_map_at_5_diff1 + value: 6.4539 + - type: nauc_map_at_10_max + value: 9.439300000000001 + - type: nauc_map_at_10_std + value: 4.0356000000000005 + - type: nauc_map_at_10_diff1 + value: 5.502400000000001 + - type: nauc_map_at_20_max + value: 9.141 + - type: nauc_map_at_20_std + value: 4.1145000000000005 + - type: nauc_map_at_20_diff1 + value: 5.2942 + - type: nauc_map_at_100_max + value: 9.0071 + - type: nauc_map_at_100_std + value: 4.2345 + - type: nauc_map_at_100_diff1 + value: 5.1606 + - type: nauc_map_at_1000_max + value: 9.017999999999999 + - type: nauc_map_at_1000_std + value: 4.2501 + - type: nauc_map_at_1000_diff1 + value: 5.162 + - type: nauc_recall_at_1_max + value: 57.6156 
+ - type: nauc_recall_at_1_std + value: -3.39 + - type: nauc_recall_at_1_diff1 + value: 83.0288 + - type: nauc_recall_at_3_max + value: 8.4358 + - type: nauc_recall_at_3_std + value: 4.925199999999999 + - type: nauc_recall_at_3_diff1 + value: 0.29009999999999997 + - type: nauc_recall_at_5_max + value: 3.2076000000000002 + - type: nauc_recall_at_5_std + value: 6.2316 + - type: nauc_recall_at_5_diff1 + value: -4.6014 + - type: nauc_recall_at_10_max + value: -1.7786 + - type: nauc_recall_at_10_std + value: 7.467300000000001 + - type: nauc_recall_at_10_diff1 + value: -9.6991 + - type: nauc_recall_at_20_max + value: -5.0717 + - type: nauc_recall_at_20_std + value: 8.1128 + - type: nauc_recall_at_20_diff1 + value: -12.5945 + - type: nauc_recall_at_100_max + value: -10.5434 + - type: nauc_recall_at_100_std + value: 11.7719 + - type: nauc_recall_at_100_diff1 + value: -18.394 + - type: nauc_recall_at_1000_max + value: -15.5908 + - type: nauc_recall_at_1000_std + value: 16.842399999999998 + - type: nauc_recall_at_1000_diff1 + value: -27.099400000000003 + - type: nauc_precision_at_1_max + value: 57.6156 + - type: nauc_precision_at_1_std + value: -3.39 + - type: nauc_precision_at_1_diff1 + value: 83.0288 + - type: nauc_precision_at_3_max + value: 8.4358 + - type: nauc_precision_at_3_std + value: 4.925199999999999 + - type: nauc_precision_at_3_diff1 + value: 0.29009999999999997 + - type: nauc_precision_at_5_max + value: 3.2076000000000002 + - type: nauc_precision_at_5_std + value: 6.2316 + - type: nauc_precision_at_5_diff1 + value: -4.6014 + - type: nauc_precision_at_10_max + value: -1.7786 + - type: nauc_precision_at_10_std + value: 7.467300000000001 + - type: nauc_precision_at_10_diff1 + value: -9.6991 + - type: nauc_precision_at_20_max + value: -5.0717 + - type: nauc_precision_at_20_std + value: 8.1128 + - type: nauc_precision_at_20_diff1 + value: -12.5945 + - type: nauc_precision_at_100_max + value: -10.5434 + - type: nauc_precision_at_100_std + value: 11.7719 + - type: 
nauc_precision_at_100_diff1 + value: -18.394 + - type: nauc_precision_at_1000_max + value: -15.5908 + - type: nauc_precision_at_1000_std + value: 16.842399999999998 + - type: nauc_precision_at_1000_diff1 + value: -27.099400000000003 + - type: nauc_mrr_at_1_max + value: 57.6156 + - type: nauc_mrr_at_1_std + value: -3.39 + - type: nauc_mrr_at_1_diff1 + value: 83.0288 + - type: nauc_mrr_at_3_max + value: 62.074 + - type: nauc_mrr_at_3_std + value: -0.45199999999999996 + - type: nauc_mrr_at_3_diff1 + value: 82.8025 + - type: nauc_mrr_at_5_max + value: 62.157300000000006 + - type: nauc_mrr_at_5_std + value: 0.2829 + - type: nauc_mrr_at_5_diff1 + value: 82.9913 + - type: nauc_mrr_at_10_max + value: 61.9838 + - type: nauc_mrr_at_10_std + value: 0.16670000000000001 + - type: nauc_mrr_at_10_diff1 + value: 82.9452 + - type: nauc_mrr_at_20_max + value: 61.9516 + - type: nauc_mrr_at_20_std + value: 0.18159999999999998 + - type: nauc_mrr_at_20_diff1 + value: 82.9723 + - type: nauc_mrr_at_100_max + value: 61.891600000000004 + - type: nauc_mrr_at_100_std + value: 0.1432 + - type: nauc_mrr_at_100_diff1 + value: 82.97489999999999 + - type: nauc_mrr_at_1000_max + value: 61.88249999999999 + - type: nauc_mrr_at_1000_std + value: 0.1357 + - type: nauc_mrr_at_1000_diff1 + value: 82.9723 + - type: main_score + value: 68.151 + task: + type: Retrieval + - dataset: + config: default + name: MTEB ImdbClassification (default) + revision: 3d86128a09e091d6018b6d26cad27f2739fc2db7 + split: test + type: mteb/imdb + metrics: + - type: accuracy + value: 72.5444 + - type: f1 + value: 72.4069 + - type: f1_weighted + value: 72.4069 + - type: ap + value: 66.8419 + - type: ap_weighted + value: 66.8419 + - type: main_score + value: 72.5444 + task: + type: Classification + - dataset: + config: default + name: MTEB MSMARCO (default) + revision: c5a29a104738b98a9e76336939199e264163d4a0 + split: dev + type: mteb/msmarco + metrics: + - type: ndcg_at_1 + value: 25.516 + - type: ndcg_at_3 + value: 
36.687999999999995 + - type: ndcg_at_5 + value: 40.864 + - type: ndcg_at_10 + value: 44.856 + - type: ndcg_at_20 + value: 47.3 + - type: ndcg_at_100 + value: 50.062 + - type: ndcg_at_1000 + value: 51.085 + - type: map_at_1 + value: 24.782 + - type: map_at_3 + value: 33.668 + - type: map_at_5 + value: 36.010999999999996 + - type: map_at_10 + value: 37.702000000000005 + - type: map_at_20 + value: 38.391 + - type: map_at_100 + value: 38.798 + - type: map_at_1000 + value: 38.841 + - type: recall_at_1 + value: 24.782 + - type: recall_at_3 + value: 44.722 + - type: recall_at_5 + value: 54.769999999999996 + - type: recall_at_10 + value: 66.842 + - type: recall_at_20 + value: 76.319 + - type: recall_at_100 + value: 90.761 + - type: recall_at_1000 + value: 98.48 + - type: precision_at_1 + value: 25.516 + - type: precision_at_3 + value: 15.506 + - type: precision_at_5 + value: 11.413 + - type: precision_at_10 + value: 6.99 + - type: precision_at_20 + value: 4.009 + - type: precision_at_100 + value: 0.959 + - type: precision_at_1000 + value: 0.105 + - type: mrr_at_1 + value: 25.5014 + - type: mrr_at_3 + value: 34.3553 + - type: mrr_at_5 + value: 36.666199999999996 + - type: mrr_at_10 + value: 38.3084 + - type: mrr_at_20 + value: 38.9663 + - type: mrr_at_100 + value: 39.341300000000004 + - type: mrr_at_1000 + value: 39.3785 + - type: nauc_ndcg_at_1_max + value: 4.2138 + - type: nauc_ndcg_at_1_std + value: -24.7801 + - type: nauc_ndcg_at_1_diff1 + value: 37.758399999999995 + - type: nauc_ndcg_at_3_max + value: 5.2536 + - type: nauc_ndcg_at_3_std + value: -29.642200000000003 + - type: nauc_ndcg_at_3_diff1 + value: 32.1639 + - type: nauc_ndcg_at_5_max + value: 5.0839 + - type: nauc_ndcg_at_5_std + value: -31.3077 + - type: nauc_ndcg_at_5_diff1 + value: 31.5135 + - type: nauc_ndcg_at_10_max + value: 6.2542 + - type: nauc_ndcg_at_10_std + value: -30.8439 + - type: nauc_ndcg_at_10_diff1 + value: 31.461299999999998 + - type: nauc_ndcg_at_20_max + value: 6.5669 + - type: 
nauc_ndcg_at_20_std + value: -29.6288 + - type: nauc_ndcg_at_20_diff1 + value: 31.590200000000003 + - type: nauc_ndcg_at_100_max + value: 6.691800000000001 + - type: nauc_ndcg_at_100_std + value: -28.1768 + - type: nauc_ndcg_at_100_diff1 + value: 32.1699 + - type: nauc_ndcg_at_1000_max + value: 6.451700000000001 + - type: nauc_ndcg_at_1000_std + value: -28.2093 + - type: nauc_ndcg_at_1000_diff1 + value: 32.3573 + - type: nauc_map_at_1_max + value: 4.1941 + - type: nauc_map_at_1_std + value: -24.9531 + - type: nauc_map_at_1_diff1 + value: 38.099 + - type: nauc_map_at_3_max + value: 4.9883999999999995 + - type: nauc_map_at_3_std + value: -28.7062 + - type: nauc_map_at_3_diff1 + value: 33.5696 + - type: nauc_map_at_5_max + value: 4.8525 + - type: nauc_map_at_5_std + value: -29.6601 + - type: nauc_map_at_5_diff1 + value: 33.2144 + - type: nauc_map_at_10_max + value: 5.3533 + - type: nauc_map_at_10_std + value: -29.4529 + - type: nauc_map_at_10_diff1 + value: 33.219300000000004 + - type: nauc_map_at_20_max + value: 5.416300000000001 + - type: nauc_map_at_20_std + value: -29.1294 + - type: nauc_map_at_20_diff1 + value: 33.2747 + - type: nauc_map_at_100_max + value: 5.4547 + - type: nauc_map_at_100_std + value: -28.8978 + - type: nauc_map_at_100_diff1 + value: 33.3505 + - type: nauc_map_at_1000_max + value: 5.4512 + - type: nauc_map_at_1000_std + value: -28.8844 + - type: nauc_map_at_1000_diff1 + value: 33.356700000000004 + - type: nauc_recall_at_1_max + value: 4.1941 + - type: nauc_recall_at_1_std + value: -24.9531 + - type: nauc_recall_at_1_diff1 + value: 38.099 + - type: nauc_recall_at_3_max + value: 5.884799999999999 + - type: nauc_recall_at_3_std + value: -32.317 + - type: nauc_recall_at_3_diff1 + value: 28.284399999999998 + - type: nauc_recall_at_5_max + value: 5.4525 + - type: nauc_recall_at_5_std + value: -36.4055 + - type: nauc_recall_at_5_diff1 + value: 26.384200000000003 + - type: nauc_recall_at_10_max + value: 9.403400000000001 + - type: nauc_recall_at_10_std 
+ value: -35.9112 + - type: nauc_recall_at_10_diff1 + value: 25.2415 + - type: nauc_recall_at_20_max + value: 12.0952 + - type: nauc_recall_at_20_std + value: -30.778299999999998 + - type: nauc_recall_at_20_diff1 + value: 24.1866 + - type: nauc_recall_at_100_max + value: 19.6413 + - type: nauc_recall_at_100_std + value: -11.9243 + - type: nauc_recall_at_100_diff1 + value: 24.6153 + - type: nauc_recall_at_1000_max + value: 48.1206 + - type: nauc_recall_at_1000_std + value: 48.0062 + - type: nauc_recall_at_1000_diff1 + value: 16.2543 + - type: nauc_precision_at_1_max + value: 4.2138 + - type: nauc_precision_at_1_std + value: -24.7801 + - type: nauc_precision_at_1_diff1 + value: 37.758399999999995 + - type: nauc_precision_at_3_max + value: 5.7985 + - type: nauc_precision_at_3_std + value: -31.749899999999997 + - type: nauc_precision_at_3_diff1 + value: 27.373399999999997 + - type: nauc_precision_at_5_max + value: 5.390000000000001 + - type: nauc_precision_at_5_std + value: -35.0586 + - type: nauc_precision_at_5_diff1 + value: 25.100099999999998 + - type: nauc_precision_at_10_max + value: 9.248199999999999 + - type: nauc_precision_at_10_std + value: -32.244299999999996 + - type: nauc_precision_at_10_diff1 + value: 22.5684 + - type: nauc_precision_at_20_max + value: 11.495099999999999 + - type: nauc_precision_at_20_std + value: -24.226300000000002 + - type: nauc_precision_at_20_diff1 + value: 19.6528 + - type: nauc_precision_at_100_max + value: 14.3649 + - type: nauc_precision_at_100_std + value: 0.0593 + - type: nauc_precision_at_100_diff1 + value: 10.9596 + - type: nauc_precision_at_1000_max + value: 10.9512 + - type: nauc_precision_at_1000_std + value: 18.288 + - type: nauc_precision_at_1000_diff1 + value: -3.5423000000000004 + - type: nauc_mrr_at_1_max + value: 4.2204 + - type: nauc_mrr_at_1_std + value: -24.7703 + - type: nauc_mrr_at_1_diff1 + value: 37.8126 + - type: nauc_mrr_at_3_max + value: 5.0668 + - type: nauc_mrr_at_3_std + value: -28.2677 + - type: 
nauc_mrr_at_3_diff1 + value: 33.3724 + - type: nauc_mrr_at_5_max + value: 5.0481 + - type: nauc_mrr_at_5_std + value: -29.133 + - type: nauc_mrr_at_5_diff1 + value: 33.0415 + - type: nauc_mrr_at_10_max + value: 5.5038 + - type: nauc_mrr_at_10_std + value: -28.886200000000002 + - type: nauc_mrr_at_10_diff1 + value: 33.0593 + - type: nauc_mrr_at_20_max + value: 5.5467 + - type: nauc_mrr_at_20_std + value: -28.5678 + - type: nauc_mrr_at_20_diff1 + value: 33.0916 + - type: nauc_mrr_at_100_max + value: 5.5636 + - type: nauc_mrr_at_100_std + value: -28.3877 + - type: nauc_mrr_at_100_diff1 + value: 33.1799 + - type: nauc_mrr_at_1000_max + value: 5.557 + - type: nauc_mrr_at_1000_std + value: -28.3796 + - type: nauc_mrr_at_1000_diff1 + value: 33.184999999999995 + - type: main_score + value: 44.856 + task: + type: Retrieval + - dataset: + config: en + name: MTEB MTOPDomainClassification (en) + revision: d80d48c1eb48d3562165c59d59d0034df9fff0bf + split: test + type: mteb/mtop_domain + metrics: + - type: accuracy + value: 93.5317 + - type: f1 + value: 93.1956 + - type: f1_weighted + value: 93.5431 + - type: main_score + value: 93.5317 + task: + type: Classification + - dataset: + config: en + name: MTEB MTOPIntentClassification (en) + revision: ae001d0e6b1228650b7bd1c2c65fb50ad11a8aba + split: test + type: mteb/mtop_intent + metrics: + - type: accuracy + value: 67.7907 + - type: f1 + value: 48.2877 + - type: f1_weighted + value: 70.3225 + - type: main_score + value: 67.7907 + task: + type: Classification + - dataset: + config: en + name: MTEB MassiveIntentClassification (en) + revision: 4672e20407010da34463acc759c162ca9734bca6 + split: test + type: mteb/amazon_massive_intent + metrics: + - type: accuracy + value: 71.456 + - type: f1 + value: 68.2268 + - type: f1_weighted + value: 70.4722 + - type: main_score + value: 71.456 + task: + type: Classification + - dataset: + config: en + name: MTEB MassiveScenarioClassification (en) + revision: 
fad2c6e8459f9e1c45d9315f4953d921437d70f8 + split: test + type: mteb/amazon_massive_scenario + metrics: + - type: accuracy + value: 76.21719999999999 + - type: f1 + value: 75.14189999999999 + - type: f1_weighted + value: 76.0733 + - type: main_score + value: 76.21719999999999 + task: + type: Classification + - dataset: + config: default + name: MTEB MedrxivClusteringP2P (default) + revision: e7a26af6f3ae46b30dde8737f02c07b1505bcc73 + split: test + type: mteb/medrxiv-clustering-p2p + metrics: + - type: v_measure + value: 31.3917 + - type: v_measure_std + value: 1.4778 + - type: main_score + value: 31.3917 + task: + type: Clustering + - dataset: + config: default + name: MTEB MedrxivClusteringS2S (default) + revision: 35191c8c0dca72d8ff3efcd72aa802307d469663 + split: test + type: mteb/medrxiv-clustering-s2s + metrics: + - type: v_measure + value: 28.2408 + - type: v_measure_std + value: 1.1622999999999999 + - type: main_score + value: 28.2408 + task: + type: Clustering + - dataset: + config: default + name: MTEB MindSmallReranking (default) + revision: 59042f120c80e8afa9cdbb224f67076cec0fc9a7 + split: test + type: mteb/mind_small + metrics: + - type: map + value: 29.5796 + - type: mrr + value: 30.3081 + - type: nAUC_map_max + value: -24.9194 + - type: nAUC_map_std + value: -9.042 + - type: nAUC_map_diff1 + value: 12.1611 + - type: nAUC_mrr_max + value: -19.3867 + - type: nAUC_mrr_std + value: -6.3873 + - type: nAUC_mrr_diff1 + value: 11.8078 + - type: main_score + value: 29.5796 + task: + type: Reranking + - dataset: + config: default + name: MTEB NFCorpus (default) + revision: ec0fa4fe99da2ff19ca1214b7966684033a58814 + split: test + type: mteb/nfcorpus + metrics: + - type: ndcg_at_1 + value: 45.046 + - type: ndcg_at_3 + value: 41.704 + - type: ndcg_at_5 + value: 39.296 + - type: ndcg_at_10 + value: 35.343999999999994 + - type: ndcg_at_20 + value: 32.525999999999996 + - type: ndcg_at_100 + value: 31.352999999999998 + - type: ndcg_at_1000 + value: 39.772 + - type: 
map_at_1 + value: 5.833 + - type: map_at_3 + value: 9.953 + - type: map_at_5 + value: 11.549 + - type: map_at_10 + value: 13.38 + - type: map_at_20 + value: 14.706 + - type: map_at_100 + value: 16.422 + - type: map_at_1000 + value: 17.777 + - type: recall_at_1 + value: 5.833 + - type: recall_at_3 + value: 11.112 + - type: recall_at_5 + value: 13.834 + - type: recall_at_10 + value: 16.961000000000002 + - type: recall_at_20 + value: 20.294999999999998 + - type: recall_at_100 + value: 30.253000000000004 + - type: recall_at_1000 + value: 60.902 + - type: precision_at_1 + value: 46.44 + - type: precision_at_3 + value: 39.009 + - type: precision_at_5 + value: 33.745999999999995 + - type: precision_at_10 + value: 25.635 + - type: precision_at_20 + value: 18.576 + - type: precision_at_100 + value: 7.731000000000001 + - type: precision_at_1000 + value: 2.037 + - type: mrr_at_1 + value: 46.7492 + - type: mrr_at_3 + value: 54.6956 + - type: mrr_at_5 + value: 55.8875 + - type: mrr_at_10 + value: 56.3913 + - type: mrr_at_20 + value: 56.6265 + - type: mrr_at_100 + value: 56.815599999999996 + - type: mrr_at_1000 + value: 56.8573 + - type: nauc_ndcg_at_1_max + value: 43.3685 + - type: nauc_ndcg_at_1_std + value: 21.6124 + - type: nauc_ndcg_at_1_diff1 + value: 29.0317 + - type: nauc_ndcg_at_3_max + value: 39.8155 + - type: nauc_ndcg_at_3_std + value: 23.2206 + - type: nauc_ndcg_at_3_diff1 + value: 20.7425 + - type: nauc_ndcg_at_5_max + value: 40.951 + - type: nauc_ndcg_at_5_std + value: 24.7184 + - type: nauc_ndcg_at_5_diff1 + value: 19.098599999999998 + - type: nauc_ndcg_at_10_max + value: 41.4733 + - type: nauc_ndcg_at_10_std + value: 27.4588 + - type: nauc_ndcg_at_10_diff1 + value: 17.224800000000002 + - type: nauc_ndcg_at_20_max + value: 40.3519 + - type: nauc_ndcg_at_20_std + value: 27.2947 + - type: nauc_ndcg_at_20_diff1 + value: 16.502 + - type: nauc_ndcg_at_100_max + value: 44.0676 + - type: nauc_ndcg_at_100_std + value: 29.1921 + - type: nauc_ndcg_at_100_diff1 + value: 
20.9199 + - type: nauc_ndcg_at_1000_max + value: 48.9082 + - type: nauc_ndcg_at_1000_std + value: 33.799600000000005 + - type: nauc_ndcg_at_1000_diff1 + value: 19.741600000000002 + - type: nauc_map_at_1_max + value: 19.2048 + - type: nauc_map_at_1_std + value: -13.564599999999999 + - type: nauc_map_at_1_diff1 + value: 37.601099999999995 + - type: nauc_map_at_3_max + value: 23.1853 + - type: nauc_map_at_3_std + value: -8.3204 + - type: nauc_map_at_3_diff1 + value: 32.5527 + - type: nauc_map_at_5_max + value: 26.747500000000002 + - type: nauc_map_at_5_std + value: -4.136 + - type: nauc_map_at_5_diff1 + value: 29.041800000000002 + - type: nauc_map_at_10_max + value: 30.492200000000004 + - type: nauc_map_at_10_std + value: 2.2847 + - type: nauc_map_at_10_diff1 + value: 25.949699999999996 + - type: nauc_map_at_20_max + value: 32.628800000000005 + - type: nauc_map_at_20_std + value: 6.2305 + - type: nauc_map_at_20_diff1 + value: 24.0997 + - type: nauc_map_at_100_max + value: 35.0282 + - type: nauc_map_at_100_std + value: 12.181899999999999 + - type: nauc_map_at_100_diff1 + value: 22.6844 + - type: nauc_map_at_1000_max + value: 35.274899999999995 + - type: nauc_map_at_1000_std + value: 14.9827 + - type: nauc_map_at_1000_diff1 + value: 21.4096 + - type: nauc_recall_at_1_max + value: 19.2048 + - type: nauc_recall_at_1_std + value: -13.564599999999999 + - type: nauc_recall_at_1_diff1 + value: 37.601099999999995 + - type: nauc_recall_at_3_max + value: 20.5895 + - type: nauc_recall_at_3_std + value: -7.8295 + - type: nauc_recall_at_3_diff1 + value: 28.4675 + - type: nauc_recall_at_5_max + value: 24.8771 + - type: nauc_recall_at_5_std + value: -2.869 + - type: nauc_recall_at_5_diff1 + value: 23.301 + - type: nauc_recall_at_10_max + value: 28.647299999999998 + - type: nauc_recall_at_10_std + value: 4.4991 + - type: nauc_recall_at_10_diff1 + value: 20.5606 + - type: nauc_recall_at_20_max + value: 30.3525 + - type: nauc_recall_at_20_std + value: 8.712 + - type: 
nauc_recall_at_20_diff1 + value: 17.4748 + - type: nauc_recall_at_100_max + value: 34.0702 + - type: nauc_recall_at_100_std + value: 23.3319 + - type: nauc_recall_at_100_diff1 + value: 17.2015 + - type: nauc_recall_at_1000_max + value: 27.8011 + - type: nauc_recall_at_1000_std + value: 21.6507 + - type: nauc_recall_at_1000_diff1 + value: 4.4638 + - type: nauc_precision_at_1_max + value: 44.6989 + - type: nauc_precision_at_1_std + value: 22.622 + - type: nauc_precision_at_1_diff1 + value: 28.881400000000003 + - type: nauc_precision_at_3_max + value: 39.4166 + - type: nauc_precision_at_3_std + value: 29.2591 + - type: nauc_precision_at_3_diff1 + value: 12.1577 + - type: nauc_precision_at_5_max + value: 39.6371 + - type: nauc_precision_at_5_std + value: 33.201 + - type: nauc_precision_at_5_diff1 + value: 7.958 + - type: nauc_precision_at_10_max + value: 38.2593 + - type: nauc_precision_at_10_std + value: 40.6097 + - type: nauc_precision_at_10_diff1 + value: 1.376 + - type: nauc_precision_at_20_max + value: 31.375999999999998 + - type: nauc_precision_at_20_std + value: 42.3468 + - type: nauc_precision_at_20_diff1 + value: -4.1699 + - type: nauc_precision_at_100_max + value: 16.628 + - type: nauc_precision_at_100_std + value: 41.800599999999996 + - type: nauc_precision_at_100_diff1 + value: -9.4674 + - type: nauc_precision_at_1000_max + value: 1.6051 + - type: nauc_precision_at_1000_std + value: 29.1306 + - type: nauc_precision_at_1000_diff1 + value: -11.1912 + - type: nauc_mrr_at_1_max + value: 44.4339 + - type: nauc_mrr_at_1_std + value: 23.6489 + - type: nauc_mrr_at_1_diff1 + value: 28.0393 + - type: nauc_mrr_at_3_max + value: 47.780899999999995 + - type: nauc_mrr_at_3_std + value: 31.412499999999998 + - type: nauc_mrr_at_3_diff1 + value: 24.1569 + - type: nauc_mrr_at_5_max + value: 48.732 + - type: nauc_mrr_at_5_std + value: 31.899100000000004 + - type: nauc_mrr_at_5_diff1 + value: 24.4177 + - type: nauc_mrr_at_10_max + value: 48.9748 + - type: nauc_mrr_at_10_std + 
value: 32.2053 + - type: nauc_mrr_at_10_diff1 + value: 24.0317 + - type: nauc_mrr_at_20_max + value: 49.0832 + - type: nauc_mrr_at_20_std + value: 32.0994 + - type: nauc_mrr_at_20_diff1 + value: 23.9777 + - type: nauc_mrr_at_100_max + value: 49.1731 + - type: nauc_mrr_at_100_std + value: 32.3179 + - type: nauc_mrr_at_100_diff1 + value: 24.081 + - type: nauc_mrr_at_1000_max + value: 49.1387 + - type: nauc_mrr_at_1000_std + value: 32.2738 + - type: nauc_mrr_at_1000_diff1 + value: 24.063200000000002 + - type: main_score + value: 35.343999999999994 + task: + type: Retrieval + - dataset: + config: default + name: MTEB NQ (default) + revision: b774495ed302d8c44a3a7ea25c90dbce03968f31 + split: test + type: mteb/nq + metrics: + - type: ndcg_at_1 + value: 44.93 + - type: ndcg_at_3 + value: 56.003 + - type: ndcg_at_5 + value: 60.150000000000006 + - type: ndcg_at_10 + value: 63.673 + - type: ndcg_at_20 + value: 65.211 + - type: ndcg_at_100 + value: 66.686 + - type: ndcg_at_1000 + value: 67.009 + - type: map_at_1 + value: 40.035 + - type: map_at_3 + value: 51.976 + - type: map_at_5 + value: 54.510999999999996 + - type: map_at_10 + value: 56.17100000000001 + - type: map_at_20 + value: 56.684 + - type: map_at_100 + value: 56.932 + - type: map_at_1000 + value: 56.946 + - type: recall_at_1 + value: 40.035 + - type: recall_at_3 + value: 64.224 + - type: recall_at_5 + value: 73.682 + - type: recall_at_10 + value: 83.809 + - type: recall_at_20 + value: 89.385 + - type: recall_at_100 + value: 96.705 + - type: recall_at_1000 + value: 99.054 + - type: precision_at_1 + value: 44.93 + - type: precision_at_3 + value: 25.019000000000002 + - type: precision_at_5 + value: 17.445 + - type: precision_at_10 + value: 10.043000000000001 + - type: precision_at_20 + value: 5.4 + - type: precision_at_100 + value: 1.174 + - type: precision_at_1000 + value: 0.121 + - type: mrr_at_1 + value: 44.9305 + - type: mrr_at_3 + value: 55.37370000000001 + - type: mrr_at_5 + value: 57.4464 + - type: mrr_at_10 + 
value: 58.680200000000006 + - type: mrr_at_20 + value: 59.0042 + - type: mrr_at_100 + value: 59.178799999999995 + - type: mrr_at_1000 + value: 59.188700000000004 + - type: nauc_ndcg_at_1_max + value: 23.8396 + - type: nauc_ndcg_at_1_std + value: -3.8885000000000005 + - type: nauc_ndcg_at_1_diff1 + value: 37.971500000000006 + - type: nauc_ndcg_at_3_max + value: 30.025800000000004 + - type: nauc_ndcg_at_3_std + value: -4.9848 + - type: nauc_ndcg_at_3_diff1 + value: 34.324799999999996 + - type: nauc_ndcg_at_5_max + value: 32.2984 + - type: nauc_ndcg_at_5_std + value: -3.263 + - type: nauc_ndcg_at_5_diff1 + value: 35.2865 + - type: nauc_ndcg_at_10_max + value: 32.4173 + - type: nauc_ndcg_at_10_std + value: -2.398 + - type: nauc_ndcg_at_10_diff1 + value: 34.767399999999995 + - type: nauc_ndcg_at_20_max + value: 32.332 + - type: nauc_ndcg_at_20_std + value: -1.7824 + - type: nauc_ndcg_at_20_diff1 + value: 35.0354 + - type: nauc_ndcg_at_100_max + value: 31.3774 + - type: nauc_ndcg_at_100_std + value: -1.4645 + - type: nauc_ndcg_at_100_diff1 + value: 35.255900000000004 + - type: nauc_ndcg_at_1000_max + value: 31.008799999999997 + - type: nauc_ndcg_at_1000_std + value: -1.9499 + - type: nauc_ndcg_at_1000_diff1 + value: 35.3522 + - type: nauc_map_at_1_max + value: 21.296300000000002 + - type: nauc_map_at_1_std + value: -6.0126 + - type: nauc_map_at_1_diff1 + value: 37.9216 + - type: nauc_map_at_3_max + value: 28.1195 + - type: nauc_map_at_3_std + value: -5.3494 + - type: nauc_map_at_3_diff1 + value: 35.0839 + - type: nauc_map_at_5_max + value: 29.365999999999996 + - type: nauc_map_at_5_std + value: -4.410200000000001 + - type: nauc_map_at_5_diff1 + value: 35.6342 + - type: nauc_map_at_10_max + value: 29.378300000000003 + - type: nauc_map_at_10_std + value: -4.0228 + - type: nauc_map_at_10_diff1 + value: 35.451 + - type: nauc_map_at_20_max + value: 29.3604 + - type: nauc_map_at_20_std + value: -3.7953 + - type: nauc_map_at_20_diff1 + value: 35.5496 + - type: 
nauc_map_at_100_max + value: 29.233199999999997 + - type: nauc_map_at_100_std + value: -3.7321 + - type: nauc_map_at_100_diff1 + value: 35.574099999999994 + - type: nauc_map_at_1000_max + value: 29.2215 + - type: nauc_map_at_1000_std + value: -3.7482 + - type: nauc_map_at_1000_diff1 + value: 35.5805 + - type: nauc_recall_at_1_max + value: 21.296300000000002 + - type: nauc_recall_at_1_std + value: -6.0126 + - type: nauc_recall_at_1_diff1 + value: 37.9216 + - type: nauc_recall_at_3_max + value: 34.2599 + - type: nauc_recall_at_3_std + value: -5.5474000000000006 + - type: nauc_recall_at_3_diff1 + value: 30.7103 + - type: nauc_recall_at_5_max + value: 41.6689 + - type: nauc_recall_at_5_std + value: -0.7705 + - type: nauc_recall_at_5_diff1 + value: 32.6001 + - type: nauc_recall_at_10_max + value: 47.236200000000004 + - type: nauc_recall_at_10_std + value: 3.9309999999999996 + - type: nauc_recall_at_10_diff1 + value: 29.277199999999997 + - type: nauc_recall_at_20_max + value: 53.957100000000004 + - type: nauc_recall_at_20_std + value: 11.282499999999999 + - type: nauc_recall_at_20_diff1 + value: 29.7674 + - type: nauc_recall_at_100_max + value: 66.87039999999999 + - type: nauc_recall_at_100_std + value: 46.8733 + - type: nauc_recall_at_100_diff1 + value: 30.0249 + - type: nauc_recall_at_1000_max + value: 88.33670000000001 + - type: nauc_recall_at_1000_std + value: 77.0724 + - type: nauc_recall_at_1000_diff1 + value: 34.0192 + - type: nauc_precision_at_1_max + value: 23.8396 + - type: nauc_precision_at_1_std + value: -3.8885000000000005 + - type: nauc_precision_at_1_diff1 + value: 37.971500000000006 + - type: nauc_precision_at_3_max + value: 31.053399999999996 + - type: nauc_precision_at_3_std + value: 0.3766 + - type: nauc_precision_at_3_diff1 + value: 21.5732 + - type: nauc_precision_at_5_max + value: 30.816100000000002 + - type: nauc_precision_at_5_std + value: 5.3659 + - type: nauc_precision_at_5_diff1 + value: 17.4728 + - type: nauc_precision_at_10_max + value: 
25.204300000000003 + - type: nauc_precision_at_10_std + value: 10.6652 + - type: nauc_precision_at_10_diff1 + value: 7.7665 + - type: nauc_precision_at_20_max + value: 20.3015 + - type: nauc_precision_at_20_std + value: 14.1789 + - type: nauc_precision_at_20_diff1 + value: 3.2251000000000003 + - type: nauc_precision_at_100_max + value: 9.709 + - type: nauc_precision_at_100_std + value: 17.7706 + - type: nauc_precision_at_100_diff1 + value: -5.5258 + - type: nauc_precision_at_1000_max + value: 4.5083 + - type: nauc_precision_at_1000_std + value: 14.754900000000001 + - type: nauc_precision_at_1000_diff1 + value: -8.1761 + - type: nauc_mrr_at_1_max + value: 23.8396 + - type: nauc_mrr_at_1_std + value: -3.8885000000000005 + - type: nauc_mrr_at_1_diff1 + value: 37.971500000000006 + - type: nauc_mrr_at_3_max + value: 28.9257 + - type: nauc_mrr_at_3_std + value: -3.6295 + - type: nauc_mrr_at_3_diff1 + value: 35.390100000000004 + - type: nauc_mrr_at_5_max + value: 29.8503 + - type: nauc_mrr_at_5_std + value: -2.8144 + - type: nauc_mrr_at_5_diff1 + value: 35.8786 + - type: nauc_mrr_at_10_max + value: 29.662899999999997 + - type: nauc_mrr_at_10_std + value: -2.6432 + - type: nauc_mrr_at_10_diff1 + value: 35.708400000000005 + - type: nauc_mrr_at_20_max + value: 29.5659 + - type: nauc_mrr_at_20_std + value: -2.6337 + - type: nauc_mrr_at_20_diff1 + value: 35.761900000000004 + - type: nauc_mrr_at_100_max + value: 29.432399999999998 + - type: nauc_mrr_at_100_std + value: -2.6328 + - type: nauc_mrr_at_100_diff1 + value: 35.8182 + - type: nauc_mrr_at_1000_max + value: 29.4234 + - type: nauc_mrr_at_1000_std + value: -2.6451 + - type: nauc_mrr_at_1000_diff1 + value: 35.8215 + - type: main_score + value: 63.673 + task: + type: Retrieval + - dataset: + config: default + name: MTEB QuoraRetrieval (default) + revision: e4e08e0b7dbe3c8700f0daef558ff32256715259 + split: test + type: mteb/quora + metrics: + - type: ndcg_at_1 + value: 82.27 + - type: ndcg_at_3 + value: 86.28099999999999 + - 
type: ndcg_at_5 + value: 87.81400000000001 + - type: ndcg_at_10 + value: 89.021 + - type: ndcg_at_20 + value: 89.643 + - type: ndcg_at_100 + value: 90.13 + - type: ndcg_at_1000 + value: 90.226 + - type: map_at_1 + value: 71.43599999999999 + - type: map_at_3 + value: 82.49 + - type: map_at_5 + value: 84.331 + - type: map_at_10 + value: 85.416 + - type: map_at_20 + value: 85.827 + - type: map_at_100 + value: 86.024 + - type: map_at_1000 + value: 86.039 + - type: recall_at_1 + value: 71.43599999999999 + - type: recall_at_3 + value: 87.912 + - type: recall_at_5 + value: 92.30000000000001 + - type: recall_at_10 + value: 95.814 + - type: recall_at_20 + value: 97.80799999999999 + - type: recall_at_100 + value: 99.551 + - type: recall_at_1000 + value: 99.97 + - type: precision_at_1 + value: 82.27 + - type: precision_at_3 + value: 37.747 + - type: precision_at_5 + value: 24.782 + - type: precision_at_10 + value: 13.497 + - type: precision_at_20 + value: 7.147 + - type: precision_at_100 + value: 1.529 + - type: precision_at_1000 + value: 0.157 + - type: mrr_at_1 + value: 82.23 + - type: mrr_at_3 + value: 87.26 + - type: mrr_at_5 + value: 87.9305 + - type: mrr_at_10 + value: 88.20949999999999 + - type: mrr_at_20 + value: 88.2764 + - type: mrr_at_100 + value: 88.2967 + - type: mrr_at_1000 + value: 88.2976 + - type: nauc_ndcg_at_1_max + value: 37.0736 + - type: nauc_ndcg_at_1_std + value: -43.2326 + - type: nauc_ndcg_at_1_diff1 + value: 77.9945 + - type: nauc_ndcg_at_3_max + value: 33.9426 + - type: nauc_ndcg_at_3_std + value: -51.3108 + - type: nauc_ndcg_at_3_diff1 + value: 76.2559 + - type: nauc_ndcg_at_5_max + value: 34.927 + - type: nauc_ndcg_at_5_std + value: -52.50749999999999 + - type: nauc_ndcg_at_5_diff1 + value: 76.578 + - type: nauc_ndcg_at_10_max + value: 35.9905 + - type: nauc_ndcg_at_10_std + value: -51.808699999999995 + - type: nauc_ndcg_at_10_diff1 + value: 76.6957 + - type: nauc_ndcg_at_20_max + value: 36.119299999999996 + - type: nauc_ndcg_at_20_std + value: 
-50.1628 + - type: nauc_ndcg_at_20_diff1 + value: 76.6659 + - type: nauc_ndcg_at_100_max + value: 36.4315 + - type: nauc_ndcg_at_100_std + value: -48.0358 + - type: nauc_ndcg_at_100_diff1 + value: 76.5866 + - type: nauc_ndcg_at_1000_max + value: 36.459399999999995 + - type: nauc_ndcg_at_1000_std + value: -47.834199999999996 + - type: nauc_ndcg_at_1000_diff1 + value: 76.5791 + - type: nauc_map_at_1_max + value: 25.902199999999997 + - type: nauc_map_at_1_std + value: -44.6605 + - type: nauc_map_at_1_diff1 + value: 80.78070000000001 + - type: nauc_map_at_3_max + value: 31.3371 + - type: nauc_map_at_3_std + value: -53.9334 + - type: nauc_map_at_3_diff1 + value: 77.7089 + - type: nauc_map_at_5_max + value: 33.1663 + - type: nauc_map_at_5_std + value: -53.86919999999999 + - type: nauc_map_at_5_diff1 + value: 77.32430000000001 + - type: nauc_map_at_10_max + value: 34.4253 + - type: nauc_map_at_10_std + value: -52.423500000000004 + - type: nauc_map_at_10_diff1 + value: 77.0479 + - type: nauc_map_at_20_max + value: 34.6738 + - type: nauc_map_at_20_std + value: -51.095400000000005 + - type: nauc_map_at_20_diff1 + value: 76.88810000000001 + - type: nauc_map_at_100_max + value: 34.7984 + - type: nauc_map_at_100_std + value: -50.2705 + - type: nauc_map_at_100_diff1 + value: 76.8083 + - type: nauc_map_at_1000_max + value: 34.8162 + - type: nauc_map_at_1000_std + value: -50.211600000000004 + - type: nauc_map_at_1000_diff1 + value: 76.8047 + - type: nauc_recall_at_1_max + value: 25.902199999999997 + - type: nauc_recall_at_1_std + value: -44.6605 + - type: nauc_recall_at_1_diff1 + value: 80.78070000000001 + - type: nauc_recall_at_3_max + value: 27.693 + - type: nauc_recall_at_3_std + value: -61.799400000000006 + - type: nauc_recall_at_3_diff1 + value: 74.25 + - type: nauc_recall_at_5_max + value: 30.216700000000003 + - type: nauc_recall_at_5_std + value: -68.2919 + - type: nauc_recall_at_5_diff1 + value: 72.8613 + - type: nauc_recall_at_10_max + value: 34.4765 + - type: 
nauc_recall_at_10_std + value: -74.3633 + - type: nauc_recall_at_10_diff1 + value: 73.0316 + - type: nauc_recall_at_20_max + value: 33.812 + - type: nauc_recall_at_20_std + value: -72.8956 + - type: nauc_recall_at_20_diff1 + value: 73.4475 + - type: nauc_recall_at_100_max + value: 39.0326 + - type: nauc_recall_at_100_std + value: -42.9628 + - type: nauc_recall_at_100_diff1 + value: 72.66669999999999 + - type: nauc_recall_at_1000_max + value: 16.4069 + - type: nauc_recall_at_1000_std + value: 20.353099999999998 + - type: nauc_recall_at_1000_diff1 + value: 72.6857 + - type: nauc_precision_at_1_max + value: 37.0736 + - type: nauc_precision_at_1_std + value: -43.2326 + - type: nauc_precision_at_1_diff1 + value: 77.9945 + - type: nauc_precision_at_3_max + value: 7.225099999999999 + - type: nauc_precision_at_3_std + value: 5.4519 + - type: nauc_precision_at_3_diff1 + value: -20.1979 + - type: nauc_precision_at_5_max + value: 3.1125 + - type: nauc_precision_at_5_std + value: 17.542099999999998 + - type: nauc_precision_at_5_diff1 + value: -32.5768 + - type: nauc_precision_at_10_max + value: -0.3758 + - type: nauc_precision_at_10_std + value: 27.9681 + - type: nauc_precision_at_10_diff1 + value: -39.8065 + - type: nauc_precision_at_20_max + value: -2.7107 + - type: nauc_precision_at_20_std + value: 34.9186 + - type: nauc_precision_at_20_diff1 + value: -42.686800000000005 + - type: nauc_precision_at_100_max + value: -4.587 + - type: nauc_precision_at_100_std + value: 41.415600000000005 + - type: nauc_precision_at_100_diff1 + value: -44.357 + - type: nauc_precision_at_1000_max + value: -5.003 + - type: nauc_precision_at_1000_std + value: 42.5355 + - type: nauc_precision_at_1000_diff1 + value: -44.5697 + - type: nauc_mrr_at_1_max + value: 37.1298 + - type: nauc_mrr_at_1_std + value: -43.2774 + - type: nauc_mrr_at_1_diff1 + value: 78.0714 + - type: nauc_mrr_at_3_max + value: 37.644800000000004 + - type: nauc_mrr_at_3_std + value: -46.231 + - type: nauc_mrr_at_3_diff1 + value: 
77.0599 + - type: nauc_mrr_at_5_max + value: 37.994299999999996 + - type: nauc_mrr_at_5_std + value: -46.0511 + - type: nauc_mrr_at_5_diff1 + value: 77.1377 + - type: nauc_mrr_at_10_max + value: 37.9206 + - type: nauc_mrr_at_10_std + value: -45.8065 + - type: nauc_mrr_at_10_diff1 + value: 77.1994 + - type: nauc_mrr_at_20_max + value: 37.8028 + - type: nauc_mrr_at_20_std + value: -45.7095 + - type: nauc_mrr_at_20_diff1 + value: 77.2152 + - type: nauc_mrr_at_100_max + value: 37.7912 + - type: nauc_mrr_at_100_std + value: -45.6767 + - type: nauc_mrr_at_100_diff1 + value: 77.2139 + - type: nauc_mrr_at_1000_max + value: 37.79 + - type: nauc_mrr_at_1000_std + value: -45.6766 + - type: nauc_mrr_at_1000_diff1 + value: 77.2145 + - type: main_score + value: 89.021 + task: + type: Retrieval + - dataset: + config: default + name: MTEB RedditClustering (default) + revision: 24640382cdbf8abc73003fb0fa6d111a705499eb + split: test + type: mteb/reddit-clustering + metrics: + - type: v_measure + value: 51.208600000000004 + - type: v_measure_std + value: 4.2761000000000005 + - type: main_score + value: 51.208600000000004 + task: + type: Clustering + - dataset: + config: default + name: MTEB RedditClusteringP2P (default) + revision: 385e3cb46b4cfa89021f56c4380204149d0efe33 + split: test + type: mteb/reddit-clustering-p2p + metrics: + - type: v_measure + value: 60.372899999999994 + - type: v_measure_std + value: 12.0829 + - type: main_score + value: 60.372899999999994 + task: + type: Clustering + - dataset: + config: default + name: MTEB SCIDOCS (default) + revision: f8c2fcf00f625baaa80f62ec5bd9e1fff3b8ae88 + split: test + type: mteb/scidocs + metrics: + - type: ndcg_at_1 + value: 22.400000000000002 + - type: ndcg_at_3 + value: 19.192 + - type: ndcg_at_5 + value: 16.767000000000003 + - type: ndcg_at_10 + value: 20.238999999999997 + - type: ndcg_at_20 + value: 22.720000000000002 + - type: ndcg_at_100 + value: 27.567999999999998 + - type: ndcg_at_1000 + value: 32.535 + - type: map_at_1 + 
value: 4.552 + - type: map_at_3 + value: 8.495999999999999 + - type: map_at_5 + value: 10.213999999999999 + - type: map_at_10 + value: 11.985 + - type: map_at_20 + value: 12.937000000000001 + - type: map_at_100 + value: 13.885 + - type: map_at_1000 + value: 14.155999999999999 + - type: recall_at_1 + value: 4.552 + - type: recall_at_3 + value: 11.067 + - type: recall_at_5 + value: 15.052 + - type: recall_at_10 + value: 21.422 + - type: recall_at_20 + value: 27.279999999999998 + - type: recall_at_100 + value: 42.968 + - type: recall_at_1000 + value: 67.232 + - type: precision_at_1 + value: 22.400000000000002 + - type: precision_at_3 + value: 18.2 + - type: precision_at_5 + value: 14.860000000000001 + - type: precision_at_10 + value: 10.58 + - type: precision_at_20 + value: 6.715 + - type: precision_at_100 + value: 2.114 + - type: precision_at_1000 + value: 0.331 + - type: mrr_at_1 + value: 22.400000000000002 + - type: mrr_at_3 + value: 31.0833 + - type: mrr_at_5 + value: 32.853300000000004 + - type: mrr_at_10 + value: 34.2814 + - type: mrr_at_20 + value: 34.814 + - type: mrr_at_100 + value: 35.2576 + - type: mrr_at_1000 + value: 35.322199999999995 + - type: nauc_ndcg_at_1_max + value: 23.7575 + - type: nauc_ndcg_at_1_std + value: 4.1697 + - type: nauc_ndcg_at_1_diff1 + value: 28.3995 + - type: nauc_ndcg_at_3_max + value: 27.5517 + - type: nauc_ndcg_at_3_std + value: 8.8005 + - type: nauc_ndcg_at_3_diff1 + value: 22.334799999999998 + - type: nauc_ndcg_at_5_max + value: 28.607599999999998 + - type: nauc_ndcg_at_5_std + value: 10.0785 + - type: nauc_ndcg_at_5_diff1 + value: 21.4713 + - type: nauc_ndcg_at_10_max + value: 30.812099999999997 + - type: nauc_ndcg_at_10_std + value: 14.4374 + - type: nauc_ndcg_at_10_diff1 + value: 20.5304 + - type: nauc_ndcg_at_20_max + value: 32.3888 + - type: nauc_ndcg_at_20_std + value: 17.8152 + - type: nauc_ndcg_at_20_diff1 + value: 20.2815 + - type: nauc_ndcg_at_100_max + value: 34.402100000000004 + - type: nauc_ndcg_at_100_std + value: 
22.3694 + - type: nauc_ndcg_at_100_diff1 + value: 20.9422 + - type: nauc_ndcg_at_1000_max + value: 33.7269 + - type: nauc_ndcg_at_1000_std + value: 23.646700000000003 + - type: nauc_ndcg_at_1000_diff1 + value: 19.7226 + - type: nauc_map_at_1_max + value: 23.5069 + - type: nauc_map_at_1_std + value: 3.8736 + - type: nauc_map_at_1_diff1 + value: 28.231 + - type: nauc_map_at_3_max + value: 27.293 + - type: nauc_map_at_3_std + value: 6.9329 + - type: nauc_map_at_3_diff1 + value: 21.8664 + - type: nauc_map_at_5_max + value: 28.591100000000004 + - type: nauc_map_at_5_std + value: 8.2248 + - type: nauc_map_at_5_diff1 + value: 21.4395 + - type: nauc_map_at_10_max + value: 30.417300000000004 + - type: nauc_map_at_10_std + value: 11.615300000000001 + - type: nauc_map_at_10_diff1 + value: 20.624000000000002 + - type: nauc_map_at_20_max + value: 31.479200000000002 + - type: nauc_map_at_20_std + value: 13.808699999999998 + - type: nauc_map_at_20_diff1 + value: 20.413 + - type: nauc_map_at_100_max + value: 32.2613 + - type: nauc_map_at_100_std + value: 15.5692 + - type: nauc_map_at_100_diff1 + value: 20.5465 + - type: nauc_map_at_1000_max + value: 32.2476 + - type: nauc_map_at_1000_std + value: 15.7471 + - type: nauc_map_at_1000_diff1 + value: 20.4622 + - type: nauc_recall_at_1_max + value: 23.5069 + - type: nauc_recall_at_1_std + value: 3.8736 + - type: nauc_recall_at_1_diff1 + value: 28.231 + - type: nauc_recall_at_3_max + value: 27.970299999999998 + - type: nauc_recall_at_3_std + value: 10.2171 + - type: nauc_recall_at_3_diff1 + value: 19.403699999999997 + - type: nauc_recall_at_5_max + value: 28.4521 + - type: nauc_recall_at_5_std + value: 12.2105 + - type: nauc_recall_at_5_diff1 + value: 17.5747 + - type: nauc_recall_at_10_max + value: 30.6955 + - type: nauc_recall_at_10_std + value: 19.096 + - type: nauc_recall_at_10_diff1 + value: 15.3116 + - type: nauc_recall_at_20_max + value: 32.1047 + - type: nauc_recall_at_20_std + value: 24.823600000000003 + - type: 
nauc_recall_at_20_diff1 + value: 14.257700000000002 + - type: nauc_recall_at_100_max + value: 33.6062 + - type: nauc_recall_at_100_std + value: 33.8641 + - type: nauc_recall_at_100_diff1 + value: 14.5145 + - type: nauc_recall_at_1000_max + value: 26.848300000000002 + - type: nauc_recall_at_1000_std + value: 38.5884 + - type: nauc_recall_at_1000_diff1 + value: 5.6408 + - type: nauc_precision_at_1_max + value: 23.7575 + - type: nauc_precision_at_1_std + value: 4.1697 + - type: nauc_precision_at_1_diff1 + value: 28.3995 + - type: nauc_precision_at_3_max + value: 28.2504 + - type: nauc_precision_at_3_std + value: 10.6227 + - type: nauc_precision_at_3_diff1 + value: 19.5683 + - type: nauc_precision_at_5_max + value: 28.8134 + - type: nauc_precision_at_5_std + value: 12.518899999999999 + - type: nauc_precision_at_5_diff1 + value: 17.8036 + - type: nauc_precision_at_10_max + value: 30.9813 + - type: nauc_precision_at_10_std + value: 19.3506 + - type: nauc_precision_at_10_diff1 + value: 15.512 + - type: nauc_precision_at_20_max + value: 32.6743 + - type: nauc_precision_at_20_std + value: 24.9974 + - type: nauc_precision_at_20_diff1 + value: 14.794099999999998 + - type: nauc_precision_at_100_max + value: 34.413700000000006 + - type: nauc_precision_at_100_std + value: 34.0889 + - type: nauc_precision_at_100_diff1 + value: 15.252699999999999 + - type: nauc_precision_at_1000_max + value: 27.3954 + - type: nauc_precision_at_1000_std + value: 37.8895 + - type: nauc_precision_at_1000_diff1 + value: 6.587999999999999 + - type: nauc_mrr_at_1_max + value: 23.7575 + - type: nauc_mrr_at_1_std + value: 4.1697 + - type: nauc_mrr_at_1_diff1 + value: 28.3995 + - type: nauc_mrr_at_3_max + value: 26.8324 + - type: nauc_mrr_at_3_std + value: 8.646700000000001 + - type: nauc_mrr_at_3_diff1 + value: 25.5754 + - type: nauc_mrr_at_5_max + value: 26.8274 + - type: nauc_mrr_at_5_std + value: 8.911 + - type: nauc_mrr_at_5_diff1 + value: 25.106 + - type: nauc_mrr_at_10_max + value: 
27.073399999999996 + - type: nauc_mrr_at_10_std + value: 9.7624 + - type: nauc_mrr_at_10_diff1 + value: 24.9405 + - type: nauc_mrr_at_20_max + value: 27.1229 + - type: nauc_mrr_at_20_std + value: 10.0676 + - type: nauc_mrr_at_20_diff1 + value: 24.8122 + - type: nauc_mrr_at_100_max + value: 27.1391 + - type: nauc_mrr_at_100_std + value: 9.9628 + - type: nauc_mrr_at_100_diff1 + value: 24.9507 + - type: nauc_mrr_at_1000_max + value: 27.114 + - type: nauc_mrr_at_1000_std + value: 9.9537 + - type: nauc_mrr_at_1000_diff1 + value: 24.9421 + - type: main_score + value: 20.238999999999997 + task: + type: Retrieval + - dataset: + config: default + name: MTEB SICK-R (default) + revision: 20a6d6f312dd54037fe07a32d58e5e168867909d + split: test + type: mteb/sickr-sts + metrics: + - type: pearson + value: 79.5908 + - type: spearman + value: 73.9888 + - type: cosine_pearson + value: 79.5908 + - type: cosine_spearman + value: 73.9888 + - type: manhattan_pearson + value: 77.0623 + - type: manhattan_spearman + value: 73.7724 + - type: euclidean_pearson + value: 77.30890000000001 + - type: euclidean_spearman + value: 73.9888 + - type: main_score + value: 73.9888 + task: + type: STS + - dataset: + config: default + name: MTEB STS12 (default) + revision: a0d554a64d88156834ff5ae9920b964011b16384 + split: test + type: mteb/sts12-sts + metrics: + - type: pearson + value: 74.0752 + - type: spearman + value: 71.22699999999999 + - type: cosine_pearson + value: 74.0752 + - type: cosine_spearman + value: 71.22699999999999 + - type: manhattan_pearson + value: 70.6037 + - type: manhattan_spearman + value: 70.9916 + - type: euclidean_pearson + value: 70.922 + - type: euclidean_spearman + value: 71.22699999999999 + - type: main_score + value: 71.22699999999999 + task: + type: STS + - dataset: + config: default + name: MTEB STS13 (default) + revision: 7e90230a92c190f1bf69ae9002b8cea547a64cca + split: test + type: mteb/sts13-sts + metrics: + - type: pearson + value: 77.8946 + - type: spearman + 
value: 80.4405 + - type: cosine_pearson + value: 77.8946 + - type: cosine_spearman + value: 80.4405 + - type: manhattan_pearson + value: 79.6856 + - type: manhattan_spearman + value: 80.1236 + - type: euclidean_pearson + value: 80.0315 + - type: euclidean_spearman + value: 80.44059999999999 + - type: main_score + value: 80.4405 + task: + type: STS + - dataset: + config: default + name: MTEB STS14 (default) + revision: 6031580fec1f6af667f0bd2da0a551cf4f0b2375 + split: test + type: mteb/sts14-sts + metrics: + - type: pearson + value: 76.2196 + - type: spearman + value: 75.10419999999999 + - type: cosine_pearson + value: 76.2196 + - type: cosine_spearman + value: 75.10419999999999 + - type: manhattan_pearson + value: 75.4647 + - type: manhattan_spearman + value: 74.81179999999999 + - type: euclidean_pearson + value: 75.8091 + - type: euclidean_spearman + value: 75.10419999999999 + - type: main_score + value: 75.10419999999999 + task: + type: STS + - dataset: + config: default + name: MTEB STS15 (default) + revision: ae752c7c21bf194d8b67fd573edf7ae58183cbe3 + split: test + type: mteb/sts15-sts + metrics: + - type: pearson + value: 81.2455 + - type: spearman + value: 82.8681 + - type: cosine_pearson + value: 81.2455 + - type: cosine_spearman + value: 82.8681 + - type: manhattan_pearson + value: 82.4327 + - type: manhattan_spearman + value: 82.7513 + - type: euclidean_pearson + value: 82.5635 + - type: euclidean_spearman + value: 82.8681 + - type: main_score + value: 82.8681 + task: + type: STS + - dataset: + config: default + name: MTEB STS16 (default) + revision: 4d8694f8f0e0100860b497b999b3dbed754a0513 + split: test + type: mteb/sts16-sts + metrics: + - type: pearson + value: 81.6322 + - type: spearman + value: 83.487 + - type: cosine_pearson + value: 81.6322 + - type: cosine_spearman + value: 83.487 + - type: manhattan_pearson + value: 83.0048 + - type: manhattan_spearman + value: 83.4064 + - type: euclidean_pearson + value: 83.0938 + - type: euclidean_spearman + 
value: 83.487 + - type: main_score + value: 83.487 + task: + type: STS + - dataset: + config: en-en + name: MTEB STS17 (en-en) + revision: faeb762787bd10488a50c8b5be4a3b82e411949c + split: test + type: mteb/sts17-crosslingual-sts + metrics: + - type: pearson + value: 81.1124 + - type: spearman + value: 84.5436 + - type: cosine_pearson + value: 81.1124 + - type: cosine_spearman + value: 84.5436 + - type: manhattan_pearson + value: 83.5158 + - type: manhattan_spearman + value: 84.596 + - type: euclidean_pearson + value: 83.4429 + - type: euclidean_spearman + value: 84.5436 + - type: main_score + value: 84.5436 + task: + type: STS + - dataset: + config: en-tr + name: MTEB STS17 (en-tr) + revision: faeb762787bd10488a50c8b5be4a3b82e411949c + split: test + type: mteb/sts17-crosslingual-sts + metrics: + - type: pearson + value: 62.0001 + - type: spearman + value: 63.631099999999996 + - type: cosine_pearson + value: 62.0001 + - type: cosine_spearman + value: 63.631099999999996 + - type: manhattan_pearson + value: 62.239599999999996 + - type: manhattan_spearman + value: 62.892199999999995 + - type: euclidean_pearson + value: 62.9809 + - type: euclidean_spearman + value: 63.631099999999996 + - type: main_score + value: 63.631099999999996 + task: + type: STS + - dataset: + config: it-en + name: MTEB STS17 (it-en) + revision: faeb762787bd10488a50c8b5be4a3b82e411949c + split: test + type: mteb/sts17-crosslingual-sts + metrics: + - type: pearson + value: 75.1556 + - type: spearman + value: 76.8807 + - type: cosine_pearson + value: 75.1556 + - type: cosine_spearman + value: 76.8807 + - type: manhattan_pearson + value: 76.2428 + - type: manhattan_spearman + value: 76.8101 + - type: euclidean_pearson + value: 76.107 + - type: euclidean_spearman + value: 76.8807 + - type: main_score + value: 76.8807 + task: + type: STS + - dataset: + config: es-en + name: MTEB STS17 (es-en) + revision: faeb762787bd10488a50c8b5be4a3b82e411949c + split: test + type: mteb/sts17-crosslingual-sts + 
metrics: + - type: pearson + value: 69.85719999999999 + - type: spearman + value: 71.0489 + - type: cosine_pearson + value: 69.85719999999999 + - type: cosine_spearman + value: 71.0489 + - type: manhattan_pearson + value: 71.08449999999999 + - type: manhattan_spearman + value: 71.0051 + - type: euclidean_pearson + value: 71.19760000000001 + - type: euclidean_spearman + value: 71.0489 + - type: main_score + value: 71.0489 + task: + type: STS + - dataset: + config: nl-en + name: MTEB STS17 (nl-en) + revision: faeb762787bd10488a50c8b5be4a3b82e411949c + split: test + type: mteb/sts17-crosslingual-sts + metrics: + - type: pearson + value: 76.1131 + - type: spearman + value: 78.2714 + - type: cosine_pearson + value: 76.1131 + - type: cosine_spearman + value: 78.2714 + - type: manhattan_pearson + value: 76.70270000000001 + - type: manhattan_spearman + value: 77.7803 + - type: euclidean_pearson + value: 77.14269999999999 + - type: euclidean_spearman + value: 78.2714 + - type: main_score + value: 78.2714 + task: + type: STS + - dataset: + config: fr-en + name: MTEB STS17 (fr-en) + revision: faeb762787bd10488a50c8b5be4a3b82e411949c + split: test + type: mteb/sts17-crosslingual-sts + metrics: + - type: pearson + value: 74.49719999999999 + - type: spearman + value: 76.2747 + - type: cosine_pearson + value: 74.49719999999999 + - type: cosine_spearman + value: 76.2747 + - type: manhattan_pearson + value: 75.071 + - type: manhattan_spearman + value: 75.8969 + - type: euclidean_pearson + value: 75.289 + - type: euclidean_spearman + value: 76.2747 + - type: main_score + value: 76.2747 + task: + type: STS + - dataset: + config: en-de + name: MTEB STS17 (en-de) + revision: faeb762787bd10488a50c8b5be4a3b82e411949c + split: test + type: mteb/sts17-crosslingual-sts + metrics: + - type: pearson + value: 76.7073 + - type: spearman + value: 79.3107 + - type: cosine_pearson + value: 76.7073 + - type: cosine_spearman + value: 79.3107 + - type: manhattan_pearson + value: 77.9578 + - type: 
manhattan_spearman + value: 79.3195 + - type: euclidean_pearson + value: 77.7386 + - type: euclidean_spearman + value: 79.3107 + - type: main_score + value: 79.3107 + task: + type: STS + - dataset: + config: en-ar + name: MTEB STS17 (en-ar) + revision: faeb762787bd10488a50c8b5be4a3b82e411949c + split: test + type: mteb/sts17-crosslingual-sts + metrics: + - type: pearson + value: 60.5826 + - type: spearman + value: 61.0502 + - type: cosine_pearson + value: 60.5826 + - type: cosine_spearman + value: 61.0502 + - type: manhattan_pearson + value: 61.202 + - type: manhattan_spearman + value: 61.2039 + - type: euclidean_pearson + value: 61.1915 + - type: euclidean_spearman + value: 61.0502 + - type: main_score + value: 61.0502 + task: + type: STS + - dataset: + config: en + name: MTEB STS22 (en) + revision: de9d86b3b84231dc21f76c7b7af1f28e2f57f6e3 + split: test + type: mteb/sts22-crosslingual-sts + metrics: + - type: pearson + value: 69.2521 + - type: spearman + value: 68.06219999999999 + - type: cosine_pearson + value: 69.2521 + - type: cosine_spearman + value: 68.06219999999999 + - type: manhattan_pearson + value: 70.5115 + - type: manhattan_spearman + value: 67.8705 + - type: euclidean_pearson + value: 70.68480000000001 + - type: euclidean_spearman + value: 68.06219999999999 + - type: main_score + value: 68.06219999999999 + task: + type: STS + - dataset: + config: pl-en + name: MTEB STS22 (pl-en) + revision: de9d86b3b84231dc21f76c7b7af1f28e2f57f6e3 + split: test + type: mteb/sts22-crosslingual-sts + metrics: + - type: pearson + value: 77.97500000000001 + - type: spearman + value: 76.848 + - type: cosine_pearson + value: 77.97500000000001 + - type: cosine_spearman + value: 76.848 + - type: manhattan_pearson + value: 76.4098 + - type: manhattan_spearman + value: 76.6188 + - type: euclidean_pearson + value: 77.17500000000001 + - type: euclidean_spearman + value: 76.848 + - type: main_score + value: 76.848 + task: + type: STS + - dataset: + config: zh-en + name: MTEB STS22 
(zh-en) + revision: de9d86b3b84231dc21f76c7b7af1f28e2f57f6e3 + split: test + type: mteb/sts22-crosslingual-sts + metrics: + - type: pearson + value: 71.3604 + - type: spearman + value: 70.7891 + - type: cosine_pearson + value: 71.3604 + - type: cosine_spearman + value: 70.7891 + - type: manhattan_pearson + value: 73.0185 + - type: manhattan_spearman + value: 70.79299999999999 + - type: euclidean_pearson + value: 73.17620000000001 + - type: euclidean_spearman + value: 70.7891 + - type: main_score + value: 70.7891 + task: + type: STS + - dataset: + config: es-en + name: MTEB STS22 (es-en) + revision: de9d86b3b84231dc21f76c7b7af1f28e2f57f6e3 + split: test + type: mteb/sts22-crosslingual-sts + metrics: + - type: pearson + value: 77.58239999999999 + - type: spearman + value: 78.5907 + - type: cosine_pearson + value: 77.58239999999999 + - type: cosine_spearman + value: 78.5907 + - type: manhattan_pearson + value: 79.25720000000001 + - type: manhattan_spearman + value: 78.6249 + - type: euclidean_pearson + value: 79.3724 + - type: euclidean_spearman + value: 78.5907 + - type: main_score + value: 78.5907 + task: + type: STS + - dataset: + config: de-en + name: MTEB STS22 (de-en) + revision: de9d86b3b84231dc21f76c7b7af1f28e2f57f6e3 + split: test + type: mteb/sts22-crosslingual-sts + metrics: + - type: pearson + value: 63.324000000000005 + - type: spearman + value: 55.1099 + - type: cosine_pearson + value: 63.324000000000005 + - type: cosine_spearman + value: 55.1099 + - type: manhattan_pearson + value: 67.3128 + - type: manhattan_spearman + value: 56.340199999999996 + - type: euclidean_pearson + value: 67.12089999999999 + - type: euclidean_spearman + value: 55.1099 + - type: main_score + value: 55.1099 + task: + type: STS + - dataset: + config: default + name: MTEB STSBenchmark (default) + revision: b0fddb56ed78048fa8b90373c8a3cfc37b684831 + split: test + type: mteb/stsbenchmark-sts + metrics: + - type: pearson + value: 78.02329999999999 + - type: spearman + value: 79.1887 
+ - type: cosine_pearson + value: 78.02329999999999 + - type: cosine_spearman + value: 79.1887 + - type: manhattan_pearson + value: 78.8951 + - type: manhattan_spearman + value: 78.9444 + - type: euclidean_pearson + value: 79.1499 + - type: euclidean_spearman + value: 79.1888 + - type: main_score + value: 79.1887 + task: + type: STS + - dataset: + config: default + name: MTEB SciDocsRR (default) + revision: d3c5e1fc0b855ab6097bf1cda04dd73947d7caab + split: test + type: mteb/scidocs-reranking + metrics: + - type: map + value: 78.7501 + - type: mrr + value: 93.9748 + - type: nAUC_map_max + value: 54.495599999999996 + - type: nAUC_map_std + value: 70.0377 + - type: nAUC_map_diff1 + value: 6.0146999999999995 + - type: nAUC_mrr_max + value: 81.1486 + - type: nAUC_mrr_std + value: 78.3478 + - type: nAUC_mrr_diff1 + value: 50.7613 + - type: main_score + value: 78.7501 + task: + type: Reranking + - dataset: + config: default + name: MTEB SciFact (default) + revision: 0228b52cf27578f30900b9e5271d331663a030d7 + split: test + type: mteb/scifact + metrics: + - type: ndcg_at_1 + value: 58.667 + - type: ndcg_at_3 + value: 66.022 + - type: ndcg_at_5 + value: 68.508 + - type: ndcg_at_10 + value: 70.586 + - type: ndcg_at_20 + value: 71.714 + - type: ndcg_at_100 + value: 72.81 + - type: ndcg_at_1000 + value: 73.482 + - type: map_at_1 + value: 55.594 + - type: map_at_3 + value: 63.2 + - type: map_at_5 + value: 64.996 + - type: map_at_10 + value: 65.988 + - type: map_at_20 + value: 66.347 + - type: map_at_100 + value: 66.526 + - type: map_at_1000 + value: 66.547 + - type: recall_at_1 + value: 55.594 + - type: recall_at_3 + value: 71.22800000000001 + - type: recall_at_5 + value: 77.078 + - type: recall_at_10 + value: 83.172 + - type: recall_at_20 + value: 87.422 + - type: recall_at_100 + value: 93.167 + - type: recall_at_1000 + value: 98.667 + - type: precision_at_1 + value: 58.667 + - type: precision_at_3 + value: 25.778000000000002 + - type: precision_at_5 + value: 17.333000000000002 
+ - type: precision_at_10 + value: 9.433 + - type: precision_at_20 + value: 4.967 + - type: precision_at_100 + value: 1.06 + - type: precision_at_1000 + value: 0.11199999999999999 + - type: mrr_at_1 + value: 58.666700000000006 + - type: mrr_at_3 + value: 65.3889 + - type: mrr_at_5 + value: 66.62219999999999 + - type: mrr_at_10 + value: 67.3364 + - type: mrr_at_20 + value: 67.6046 + - type: mrr_at_100 + value: 67.73320000000001 + - type: mrr_at_1000 + value: 67.7526 + - type: nauc_ndcg_at_1_max + value: 60.2511 + - type: nauc_ndcg_at_1_std + value: 12.422 + - type: nauc_ndcg_at_1_diff1 + value: 74.4289 + - type: nauc_ndcg_at_3_max + value: 60.2109 + - type: nauc_ndcg_at_3_std + value: 11.0152 + - type: nauc_ndcg_at_3_diff1 + value: 71.0436 + - type: nauc_ndcg_at_5_max + value: 62.690999999999995 + - type: nauc_ndcg_at_5_std + value: 13.585700000000001 + - type: nauc_ndcg_at_5_diff1 + value: 70.4007 + - type: nauc_ndcg_at_10_max + value: 62.740899999999996 + - type: nauc_ndcg_at_10_std + value: 13.980400000000001 + - type: nauc_ndcg_at_10_diff1 + value: 70.0506 + - type: nauc_ndcg_at_20_max + value: 62.271699999999996 + - type: nauc_ndcg_at_20_std + value: 15.9756 + - type: nauc_ndcg_at_20_diff1 + value: 70.3237 + - type: nauc_ndcg_at_100_max + value: 62.125 + - type: nauc_ndcg_at_100_std + value: 15.5809 + - type: nauc_ndcg_at_100_diff1 + value: 70.4151 + - type: nauc_ndcg_at_1000_max + value: 61.9259 + - type: nauc_ndcg_at_1000_std + value: 15.3462 + - type: nauc_ndcg_at_1000_diff1 + value: 70.7346 + - type: nauc_map_at_1_max + value: 53.6767 + - type: nauc_map_at_1_std + value: 3.7751 + - type: nauc_map_at_1_diff1 + value: 74.60329999999999 + - type: nauc_map_at_3_max + value: 57.0403 + - type: nauc_map_at_3_std + value: 8.2272 + - type: nauc_map_at_3_diff1 + value: 71.7906 + - type: nauc_map_at_5_max + value: 59.6713 + - type: nauc_map_at_5_std + value: 10.8346 + - type: nauc_map_at_5_diff1 + value: 71.3356 + - type: nauc_map_at_10_max + value: 60.0086 + - type: 
nauc_map_at_10_std + value: 11.4394 + - type: nauc_map_at_10_diff1 + value: 71.14869999999999 + - type: nauc_map_at_20_max + value: 59.940599999999996 + - type: nauc_map_at_20_std + value: 12.0728 + - type: nauc_map_at_20_diff1 + value: 71.31 + - type: nauc_map_at_100_max + value: 59.95589999999999 + - type: nauc_map_at_100_std + value: 12.148299999999999 + - type: nauc_map_at_100_diff1 + value: 71.2142 + - type: nauc_map_at_1000_max + value: 59.9486 + - type: nauc_map_at_1000_std + value: 12.139 + - type: nauc_map_at_1000_diff1 + value: 71.2225 + - type: nauc_recall_at_1_max + value: 53.6767 + - type: nauc_recall_at_1_std + value: 3.7751 + - type: nauc_recall_at_1_diff1 + value: 74.60329999999999 + - type: nauc_recall_at_3_max + value: 60.4078 + - type: nauc_recall_at_3_std + value: 9.038300000000001 + - type: nauc_recall_at_3_diff1 + value: 67.60119999999999 + - type: nauc_recall_at_5_max + value: 68.0179 + - type: nauc_recall_at_5_std + value: 16.061600000000002 + - type: nauc_recall_at_5_diff1 + value: 65.54759999999999 + - type: nauc_recall_at_10_max + value: 68.7372 + - type: nauc_recall_at_10_std + value: 16.8637 + - type: nauc_recall_at_10_diff1 + value: 62.7613 + - type: nauc_recall_at_20_max + value: 67.1403 + - type: nauc_recall_at_20_std + value: 31.3919 + - type: nauc_recall_at_20_diff1 + value: 62.66929999999999 + - type: nauc_recall_at_100_max + value: 68.6366 + - type: nauc_recall_at_100_std + value: 32.4577 + - type: nauc_recall_at_100_diff1 + value: 64.52029999999999 + - type: nauc_recall_at_1000_max + value: 70.7166 + - type: nauc_recall_at_1000_std + value: 70.47149999999999 + - type: nauc_recall_at_1000_diff1 + value: 85.58590000000001 + - type: nauc_precision_at_1_max + value: 60.2511 + - type: nauc_precision_at_1_std + value: 12.422 + - type: nauc_precision_at_1_diff1 + value: 74.4289 + - type: nauc_precision_at_3_max + value: 58.75280000000001 + - type: nauc_precision_at_3_std + value: 27.605400000000003 + - type: nauc_precision_at_3_diff1 + 
value: 49.1523 + - type: nauc_precision_at_5_max + value: 56.4694 + - type: nauc_precision_at_5_std + value: 39.080799999999996 + - type: nauc_precision_at_5_diff1 + value: 28.8162 + - type: nauc_precision_at_10_max + value: 48.8894 + - type: nauc_precision_at_10_std + value: 43.8149 + - type: nauc_precision_at_10_diff1 + value: 15.0093 + - type: nauc_precision_at_20_max + value: 41.4059 + - type: nauc_precision_at_20_std + value: 50.7143 + - type: nauc_precision_at_20_diff1 + value: 8.3552 + - type: nauc_precision_at_100_max + value: 33.5064 + - type: nauc_precision_at_100_std + value: 52.8775 + - type: nauc_precision_at_100_diff1 + value: -5.0870999999999995 + - type: nauc_precision_at_1000_max + value: 23.9064 + - type: nauc_precision_at_1000_std + value: 57.784800000000004 + - type: nauc_precision_at_1000_diff1 + value: -20.1246 + - type: nauc_mrr_at_1_max + value: 60.2511 + - type: nauc_mrr_at_1_std + value: 12.422 + - type: nauc_mrr_at_1_diff1 + value: 74.4289 + - type: nauc_mrr_at_3_max + value: 62.663199999999996 + - type: nauc_mrr_at_3_std + value: 14.7348 + - type: nauc_mrr_at_3_diff1 + value: 72.1185 + - type: nauc_mrr_at_5_max + value: 63.3871 + - type: nauc_mrr_at_5_std + value: 15.773000000000001 + - type: nauc_mrr_at_5_diff1 + value: 71.6722 + - type: nauc_mrr_at_10_max + value: 62.8474 + - type: nauc_mrr_at_10_std + value: 15.1896 + - type: nauc_mrr_at_10_diff1 + value: 71.64110000000001 + - type: nauc_mrr_at_20_max + value: 62.699400000000004 + - type: nauc_mrr_at_20_std + value: 15.554499999999999 + - type: nauc_mrr_at_20_diff1 + value: 71.6049 + - type: nauc_mrr_at_100_max + value: 62.6665 + - type: nauc_mrr_at_100_std + value: 15.4586 + - type: nauc_mrr_at_100_diff1 + value: 71.6217 + - type: nauc_mrr_at_1000_max + value: 62.6641 + - type: nauc_mrr_at_1000_std + value: 15.4535 + - type: nauc_mrr_at_1000_diff1 + value: 71.6307 + - type: main_score + value: 70.586 + task: + type: Retrieval + - dataset: + config: default + name: MTEB 
SprintDuplicateQuestions (default) + revision: d66bd1f72af766a5cc4b0ca5e00c162f89e8cc46 + split: test + type: mteb/sprintduplicatequestions-pairclassification + metrics: + - type: similarity_accuracy + value: 99.8416 + - type: similarity_accuracy_threshold + value: 74.52069999999999 + - type: similarity_f1 + value: 92.008 + - type: similarity_f1_threshold + value: 74.4529 + - type: similarity_precision + value: 91.9162 + - type: similarity_recall + value: 92.10000000000001 + - type: similarity_ap + value: 96.54209999999999 + - type: cosine_accuracy + value: 99.8416 + - type: cosine_accuracy_threshold + value: 74.52069999999999 + - type: cosine_f1 + value: 92.008 + - type: cosine_f1_threshold + value: 74.4529 + - type: cosine_precision + value: 91.9162 + - type: cosine_recall + value: 92.10000000000001 + - type: cosine_ap + value: 96.54209999999999 + - type: manhattan_accuracy + value: 99.8446 + - type: manhattan_accuracy_threshold + value: 1784.866 + - type: manhattan_f1 + value: 92.1539 + - type: manhattan_f1_threshold + value: 1787.6774 + - type: manhattan_precision + value: 92.1079 + - type: manhattan_recall + value: 92.2 + - type: manhattan_ap + value: 96.5207 + - type: euclidean_accuracy + value: 99.8416 + - type: euclidean_accuracy_threshold + value: 71.3853 + - type: euclidean_f1 + value: 92.008 + - type: euclidean_f1_threshold + value: 71.4803 + - type: euclidean_precision + value: 91.9162 + - type: euclidean_recall + value: 92.10000000000001 + - type: euclidean_ap + value: 96.54209999999999 + - type: dot_accuracy + value: 99.8416 + - type: dot_accuracy_threshold + value: 74.52069999999999 + - type: dot_f1 + value: 92.008 + - type: dot_f1_threshold + value: 74.4528 + - type: dot_precision + value: 91.9162 + - type: dot_recall + value: 92.10000000000001 + - type: dot_ap + value: 96.54209999999999 + - type: max_accuracy + value: 99.8446 + - type: max_f1 + value: 92.1539 + - type: max_precision + value: 92.1079 + - type: max_recall + value: 92.2 + - type: 
max_ap + value: 96.54209999999999 + - type: main_score + value: 96.54209999999999 + task: + type: PairClassification + - dataset: + config: default + name: MTEB StackExchangeClustering (default) + revision: 6cbc1f7b2bc0622f2e39d2c77fa502909748c259 + split: test + type: mteb/stackexchange-clustering + metrics: + - type: v_measure + value: 63.4035 + - type: v_measure_std + value: 4.758 + - type: main_score + value: 63.4035 + task: + type: Clustering + - dataset: + config: default + name: MTEB StackExchangeClusteringP2P (default) + revision: 815ca46b2622cec33ccafc3735d572c266efdb44 + split: test + type: mteb/stackexchange-clustering-p2p + metrics: + - type: v_measure + value: 36.288599999999995 + - type: v_measure_std + value: 1.3107 + - type: main_score + value: 36.288599999999995 + task: + type: Clustering + - dataset: + config: default + name: MTEB StackOverflowDupQuestions (default) + revision: e185fbe320c72810689fc5848eb6114e1ef5ec69 + split: test + type: mteb/stackoverflowdupquestions-reranking + metrics: + - type: map + value: 51.457699999999996 + - type: mrr + value: 52.374500000000005 + - type: nAUC_map_max + value: 12.912399999999998 + - type: nAUC_map_std + value: 6.4524 + - type: nAUC_map_diff1 + value: 37.2785 + - type: nAUC_mrr_max + value: 13.333999999999998 + - type: nAUC_mrr_std + value: 7.0440000000000005 + - type: nAUC_mrr_diff1 + value: 37.2993 + - type: main_score + value: 51.457699999999996 + task: + type: Reranking + - dataset: + config: default + name: MTEB SummEval (default) + revision: cda12ad7615edc362dbf25a00fdd61d3b1eaf93c + split: test + type: mteb/summeval + metrics: + - type: pearson + value: 29.7101 + - type: spearman + value: 30.514200000000002 + - type: cosine_spearman + value: 30.514200000000002 + - type: cosine_pearson + value: 29.7101 + - type: dot_spearman + value: 30.514200000000002 + - type: dot_pearson + value: 29.7101 + - type: main_score + value: 30.514200000000002 + task: + type: Summarization + - dataset: + config: default 
+ name: MTEB TRECCOVID (default) + revision: bb9466bac8153a0349341eb1b22e06409e78ef4e + split: test + type: mteb/trec-covid + metrics: + - type: ndcg_at_1 + value: 86.0 + - type: ndcg_at_3 + value: 86.542 + - type: ndcg_at_5 + value: 85.297 + - type: ndcg_at_10 + value: 83.866 + - type: ndcg_at_20 + value: 80.553 + - type: ndcg_at_100 + value: 65.091 + - type: ndcg_at_1000 + value: 57.86900000000001 + - type: map_at_1 + value: 0.23500000000000001 + - type: map_at_3 + value: 0.7100000000000001 + - type: map_at_5 + value: 1.1440000000000001 + - type: map_at_10 + value: 2.185 + - type: map_at_20 + value: 4.004 + - type: map_at_100 + value: 13.25 + - type: map_at_1000 + value: 32.668 + - type: recall_at_1 + value: 0.23500000000000001 + - type: recall_at_3 + value: 0.736 + - type: recall_at_5 + value: 1.191 + - type: recall_at_10 + value: 2.323 + - type: recall_at_20 + value: 4.390000000000001 + - type: recall_at_100 + value: 15.962000000000002 + - type: recall_at_1000 + value: 54.290000000000006 + - type: precision_at_1 + value: 90.0 + - type: precision_at_3 + value: 92.0 + - type: precision_at_5 + value: 90.0 + - type: precision_at_10 + value: 88.6 + - type: precision_at_20 + value: 85.5 + - type: precision_at_100 + value: 67.14 + - type: precision_at_1000 + value: 25.81 + - type: mrr_at_1 + value: 90.0 + - type: mrr_at_3 + value: 94.6667 + - type: mrr_at_5 + value: 94.6667 + - type: mrr_at_10 + value: 94.6667 + - type: mrr_at_20 + value: 94.6667 + - type: mrr_at_100 + value: 94.6667 + - type: mrr_at_1000 + value: 94.6667 + - type: nauc_ndcg_at_1_max + value: -0.0208 + - type: nauc_ndcg_at_1_std + value: 9.228200000000001 + - type: nauc_ndcg_at_1_diff1 + value: -7.4962 + - type: nauc_ndcg_at_3_max + value: 16.5755 + - type: nauc_ndcg_at_3_std + value: 39.0511 + - type: nauc_ndcg_at_3_diff1 + value: -14.5975 + - type: nauc_ndcg_at_5_max + value: 15.326799999999999 + - type: nauc_ndcg_at_5_std + value: 44.2523 + - type: nauc_ndcg_at_5_diff1 + value: -15.004600000000002 
+ - type: nauc_ndcg_at_10_max + value: 34.5609 + - type: nauc_ndcg_at_10_std + value: 62.8752 + - type: nauc_ndcg_at_10_diff1 + value: -22.9907 + - type: nauc_ndcg_at_20_max + value: 35.7633 + - type: nauc_ndcg_at_20_std + value: 74.1826 + - type: nauc_ndcg_at_20_diff1 + value: -26.3264 + - type: nauc_ndcg_at_100_max + value: 36.939499999999995 + - type: nauc_ndcg_at_100_std + value: 80.702 + - type: nauc_ndcg_at_100_diff1 + value: -41.7784 + - type: nauc_ndcg_at_1000_max + value: 41.3313 + - type: nauc_ndcg_at_1000_std + value: 68.0671 + - type: nauc_ndcg_at_1000_diff1 + value: -14.6009 + - type: nauc_map_at_1_max + value: -15.2873 + - type: nauc_map_at_1_std + value: -24.4781 + - type: nauc_map_at_1_diff1 + value: 35.4803 + - type: nauc_map_at_3_max + value: -14.107700000000001 + - type: nauc_map_at_3_std + value: -23.197699999999998 + - type: nauc_map_at_3_diff1 + value: 37.8596 + - type: nauc_map_at_5_max + value: -12.7588 + - type: nauc_map_at_5_std + value: -20.174400000000002 + - type: nauc_map_at_5_diff1 + value: 39.575700000000005 + - type: nauc_map_at_10_max + value: -4.8804 + - type: nauc_map_at_10_std + value: -11.0753 + - type: nauc_map_at_10_diff1 + value: 38.2457 + - type: nauc_map_at_20_max + value: 0.7396 + - type: nauc_map_at_20_std + value: 0.3599 + - type: nauc_map_at_20_diff1 + value: 35.4735 + - type: nauc_map_at_100_max + value: 20.011000000000003 + - type: nauc_map_at_100_std + value: 45.2654 + - type: nauc_map_at_100_diff1 + value: 3.6394 + - type: nauc_map_at_1000_max + value: 43.317099999999996 + - type: nauc_map_at_1000_std + value: 74.6629 + - type: nauc_map_at_1000_diff1 + value: -22.509 + - type: nauc_recall_at_1_max + value: -15.2873 + - type: nauc_recall_at_1_std + value: -24.4781 + - type: nauc_recall_at_1_diff1 + value: 35.4803 + - type: nauc_recall_at_3_max + value: -14.1509 + - type: nauc_recall_at_3_std + value: -24.7684 + - type: nauc_recall_at_3_diff1 + value: 40.6736 + - type: nauc_recall_at_5_max + value: 
-13.053899999999999 + - type: nauc_recall_at_5_std + value: -21.7134 + - type: nauc_recall_at_5_diff1 + value: 42.4446 + - type: nauc_recall_at_10_max + value: -7.3492 + - type: nauc_recall_at_10_std + value: -15.7989 + - type: nauc_recall_at_10_diff1 + value: 41.6543 + - type: nauc_recall_at_20_max + value: -4.8004 + - type: nauc_recall_at_20_std + value: -9.6834 + - type: nauc_recall_at_20_diff1 + value: 41.7323 + - type: nauc_recall_at_100_max + value: 11.3356 + - type: nauc_recall_at_100_std + value: 28.1118 + - type: nauc_recall_at_100_diff1 + value: 15.6166 + - type: nauc_recall_at_1000_max + value: 39.9341 + - type: nauc_recall_at_1000_std + value: 54.15410000000001 + - type: nauc_recall_at_1000_diff1 + value: -2.0016 + - type: nauc_precision_at_1_max + value: 12.2035 + - type: nauc_precision_at_1_std + value: 24.1923 + - type: nauc_precision_at_1_diff1 + value: -25.368800000000004 + - type: nauc_precision_at_3_max + value: 31.019600000000004 + - type: nauc_precision_at_3_std + value: 56.08539999999999 + - type: nauc_precision_at_3_diff1 + value: -33.821600000000004 + - type: nauc_precision_at_5_max + value: 26.127699999999997 + - type: nauc_precision_at_5_std + value: 52.8458 + - type: nauc_precision_at_5_diff1 + value: -22.24 + - type: nauc_precision_at_10_max + value: 45.8122 + - type: nauc_precision_at_10_std + value: 71.9086 + - type: nauc_precision_at_10_diff1 + value: -28.500700000000002 + - type: nauc_precision_at_20_max + value: 44.2567 + - type: nauc_precision_at_20_std + value: 80.86410000000001 + - type: nauc_precision_at_20_diff1 + value: -28.518 + - type: nauc_precision_at_100_max + value: 42.8044 + - type: nauc_precision_at_100_std + value: 84.13669999999999 + - type: nauc_precision_at_100_diff1 + value: -47.1098 + - type: nauc_precision_at_1000_max + value: 40.260200000000005 + - type: nauc_precision_at_1000_std + value: 53.53059999999999 + - type: nauc_precision_at_1000_diff1 + value: -41.2652 + - type: nauc_mrr_at_1_max + value: 12.2035 + - 
type: nauc_mrr_at_1_std + value: 24.1923 + - type: nauc_mrr_at_1_diff1 + value: -25.368800000000004 + - type: nauc_mrr_at_3_max + value: 16.8738 + - type: nauc_mrr_at_3_std + value: 28.113300000000002 + - type: nauc_mrr_at_3_diff1 + value: -20.3198 + - type: nauc_mrr_at_5_max + value: 16.8738 + - type: nauc_mrr_at_5_std + value: 28.113300000000002 + - type: nauc_mrr_at_5_diff1 + value: -20.3198 + - type: nauc_mrr_at_10_max + value: 16.8738 + - type: nauc_mrr_at_10_std + value: 28.113300000000002 + - type: nauc_mrr_at_10_diff1 + value: -20.3198 + - type: nauc_mrr_at_20_max + value: 16.8738 + - type: nauc_mrr_at_20_std + value: 28.113300000000002 + - type: nauc_mrr_at_20_diff1 + value: -20.3198 + - type: nauc_mrr_at_100_max + value: 16.8738 + - type: nauc_mrr_at_100_std + value: 28.113300000000002 + - type: nauc_mrr_at_100_diff1 + value: -20.3198 + - type: nauc_mrr_at_1000_max + value: 16.8738 + - type: nauc_mrr_at_1000_std + value: 28.113300000000002 + - type: nauc_mrr_at_1000_diff1 + value: -20.3198 + - type: main_score + value: 83.866 + task: + type: Retrieval + - dataset: + config: default + name: MTEB Touche2020 (default) + revision: a34f9a33db75fa0cbb21bb5cfc3dae8dc8bec93f + split: test + type: mteb/touche2020 + metrics: + - type: ndcg_at_1 + value: 38.775999999999996 + - type: ndcg_at_3 + value: 33.664 + - type: ndcg_at_5 + value: 31.61 + - type: ndcg_at_10 + value: 29.499 + - type: ndcg_at_20 + value: 29.772 + - type: ndcg_at_100 + value: 39.845000000000006 + - type: ndcg_at_1000 + value: 51.141999999999996 + - type: map_at_1 + value: 3.004 + - type: map_at_3 + value: 6.027 + - type: map_at_5 + value: 7.993 + - type: map_at_10 + value: 11.546 + - type: map_at_20 + value: 14.185 + - type: map_at_100 + value: 17.698 + - type: map_at_1000 + value: 19.364 + - type: recall_at_1 + value: 3.004 + - type: recall_at_3 + value: 7.178 + - type: recall_at_5 + value: 11.196 + - type: recall_at_10 + value: 18.584999999999997 + - type: recall_at_20 + value: 
26.845999999999997 + - type: recall_at_100 + value: 49.025 + - type: recall_at_1000 + value: 82.884 + - type: precision_at_1 + value: 40.816 + - type: precision_at_3 + value: 33.333 + - type: precision_at_5 + value: 30.612000000000002 + - type: precision_at_10 + value: 25.714 + - type: precision_at_20 + value: 19.387999999999998 + - type: precision_at_100 + value: 7.939 + - type: precision_at_1000 + value: 1.545 + - type: mrr_at_1 + value: 40.8163 + - type: mrr_at_3 + value: 53.401399999999995 + - type: mrr_at_5 + value: 56.7687 + - type: mrr_at_10 + value: 57.5421 + - type: mrr_at_20 + value: 58.142 + - type: mrr_at_100 + value: 58.2307 + - type: mrr_at_1000 + value: 58.2307 + - type: nauc_ndcg_at_1_max + value: -18.0584 + - type: nauc_ndcg_at_1_std + value: -25.634600000000002 + - type: nauc_ndcg_at_1_diff1 + value: -1.7021000000000002 + - type: nauc_ndcg_at_3_max + value: -17.8622 + - type: nauc_ndcg_at_3_std + value: -20.119799999999998 + - type: nauc_ndcg_at_3_diff1 + value: -2.399 + - type: nauc_ndcg_at_5_max + value: -22.0829 + - type: nauc_ndcg_at_5_std + value: -22.841 + - type: nauc_ndcg_at_5_diff1 + value: -12.350200000000001 + - type: nauc_ndcg_at_10_max + value: -17.858999999999998 + - type: nauc_ndcg_at_10_std + value: -17.9067 + - type: nauc_ndcg_at_10_diff1 + value: -9.3129 + - type: nauc_ndcg_at_20_max + value: -24.479400000000002 + - type: nauc_ndcg_at_20_std + value: -16.06 + - type: nauc_ndcg_at_20_diff1 + value: -10.57 + - type: nauc_ndcg_at_100_max + value: -20.9167 + - type: nauc_ndcg_at_100_std + value: 9.6051 + - type: nauc_ndcg_at_100_diff1 + value: -0.2363 + - type: nauc_ndcg_at_1000_max + value: -13.6708 + - type: nauc_ndcg_at_1000_std + value: 17.956 + - type: nauc_ndcg_at_1000_diff1 + value: -2.5696 + - type: nauc_map_at_1_max + value: -14.276900000000001 + - type: nauc_map_at_1_std + value: -31.3091 + - type: nauc_map_at_1_diff1 + value: -1.4354 + - type: nauc_map_at_3_max + value: -21.7098 + - type: nauc_map_at_3_std + value: 
-32.112899999999996 + - type: nauc_map_at_3_diff1 + value: -8.846 + - type: nauc_map_at_5_max + value: -16.700200000000002 + - type: nauc_map_at_5_std + value: -32.643499999999996 + - type: nauc_map_at_5_diff1 + value: -13.9766 + - type: nauc_map_at_10_max + value: -13.415199999999999 + - type: nauc_map_at_10_std + value: -28.459200000000003 + - type: nauc_map_at_10_diff1 + value: -12.4042 + - type: nauc_map_at_20_max + value: -17.8629 + - type: nauc_map_at_20_std + value: -24.5837 + - type: nauc_map_at_20_diff1 + value: -14.9642 + - type: nauc_map_at_100_max + value: -15.6478 + - type: nauc_map_at_100_std + value: -11.4237 + - type: nauc_map_at_100_diff1 + value: -11.542 + - type: nauc_map_at_1000_max + value: -15.2149 + - type: nauc_map_at_1000_std + value: -8.0384 + - type: nauc_map_at_1000_diff1 + value: -12.984000000000002 + - type: nauc_recall_at_1_max + value: -14.276900000000001 + - type: nauc_recall_at_1_std + value: -31.3091 + - type: nauc_recall_at_1_diff1 + value: -1.4354 + - type: nauc_recall_at_3_max + value: -23.021900000000002 + - type: nauc_recall_at_3_std + value: -30.2834 + - type: nauc_recall_at_3_diff1 + value: -11.4226 + - type: nauc_recall_at_5_max + value: -20.596600000000002 + - type: nauc_recall_at_5_std + value: -33.219300000000004 + - type: nauc_recall_at_5_diff1 + value: -17.718999999999998 + - type: nauc_recall_at_10_max + value: -16.1214 + - type: nauc_recall_at_10_std + value: -23.9041 + - type: nauc_recall_at_10_diff1 + value: -11.047 + - type: nauc_recall_at_20_max + value: -25.603399999999997 + - type: nauc_recall_at_20_std + value: -15.8105 + - type: nauc_recall_at_20_diff1 + value: -14.546000000000001 + - type: nauc_recall_at_100_max + value: -16.389400000000002 + - type: nauc_recall_at_100_std + value: 28.5141 + - type: nauc_recall_at_100_diff1 + value: 6.1868 + - type: nauc_recall_at_1000_max + value: 11.022 + - type: nauc_recall_at_1000_std + value: 68.0021 + - type: nauc_recall_at_1000_diff1 + value: 8.426 + - type: 
nauc_precision_at_1_max + value: -17.1625 + - type: nauc_precision_at_1_std + value: -27.9451 + - type: nauc_precision_at_1_diff1 + value: 1.0831 + - type: nauc_precision_at_3_max + value: -17.2798 + - type: nauc_precision_at_3_std + value: -20.347199999999997 + - type: nauc_precision_at_3_diff1 + value: -5.2689 + - type: nauc_precision_at_5_max + value: -19.6408 + - type: nauc_precision_at_5_std + value: -24.157 + - type: nauc_precision_at_5_diff1 + value: -20.274900000000002 + - type: nauc_precision_at_10_max + value: -11.8033 + - type: nauc_precision_at_10_std + value: -7.2727 + - type: nauc_precision_at_10_diff1 + value: -9.3776 + - type: nauc_precision_at_20_max + value: -20.1541 + - type: nauc_precision_at_20_std + value: 9.0645 + - type: nauc_precision_at_20_diff1 + value: -16.1323 + - type: nauc_precision_at_100_max + value: 0.3701 + - type: nauc_precision_at_100_std + value: 67.6941 + - type: nauc_precision_at_100_diff1 + value: 8.0336 + - type: nauc_precision_at_1000_max + value: 38.8632 + - type: nauc_precision_at_1000_std + value: 38.0504 + - type: nauc_precision_at_1000_diff1 + value: 0.5907 + - type: nauc_mrr_at_1_max + value: -17.1625 + - type: nauc_mrr_at_1_std + value: -27.9451 + - type: nauc_mrr_at_1_diff1 + value: 1.0831 + - type: nauc_mrr_at_3_max + value: -20.479300000000002 + - type: nauc_mrr_at_3_std + value: -21.9225 + - type: nauc_mrr_at_3_diff1 + value: -1.5211000000000001 + - type: nauc_mrr_at_5_max + value: -24.8175 + - type: nauc_mrr_at_5_std + value: -23.805 + - type: nauc_mrr_at_5_diff1 + value: -7.9258 + - type: nauc_mrr_at_10_max + value: -22.53 + - type: nauc_mrr_at_10_std + value: -21.9391 + - type: nauc_mrr_at_10_diff1 + value: -5.7533 + - type: nauc_mrr_at_20_max + value: -22.7064 + - type: nauc_mrr_at_20_std + value: -22.4697 + - type: nauc_mrr_at_20_diff1 + value: -5.7068 + - type: nauc_mrr_at_100_max + value: -23.0016 + - type: nauc_mrr_at_100_std + value: -22.488 + - type: nauc_mrr_at_100_diff1 + value: -5.3738 + - type: 
nauc_mrr_at_1000_max + value: -23.0016 + - type: nauc_mrr_at_1000_std + value: -22.488 + - type: nauc_mrr_at_1000_diff1 + value: -5.3738 + - type: main_score + value: 29.499 + task: + type: Retrieval + - dataset: + config: default + name: MTEB ToxicConversationsClassification (default) + revision: edfaf9da55d3dd50d43143d90c1ac476895ae6de + split: test + type: mteb/toxic_conversations_50k + metrics: + - type: accuracy + value: 65.8643 + - type: f1 + value: 50.6764 + - type: f1_weighted + value: 73.2472 + - type: ap + value: 12.2658 + - type: ap_weighted + value: 12.2658 + - type: main_score + value: 65.8643 + task: + type: Classification + - dataset: + config: default + name: MTEB TweetSentimentExtractionClassification (default) + revision: d604517c81ca91fe16a244d1248fc021f9ecee7a + split: test + type: mteb/tweet_sentiment_extraction + metrics: + - type: accuracy + value: 59.6633 + - type: f1 + value: 59.935700000000004 + - type: f1_weighted + value: 59.0249 + - type: main_score + value: 59.6633 + task: + type: Classification + - dataset: + config: default + name: MTEB TwentyNewsgroupsClustering (default) + revision: 6125ec4e24fa026cec8a478383ee943acfbd5449 + split: test + type: mteb/twentynewsgroups-clustering + metrics: + - type: v_measure + value: 43.2311 + - type: v_measure_std + value: 2.3994999999999997 + - type: main_score + value: 43.2311 + task: + type: Clustering + - dataset: + config: default + name: MTEB TwitterSemEval2015 (default) + revision: 70970daeab8776df92f5ea462b6173c0b46fd2d1 + split: test + type: mteb/twittersemeval2015-pairclassification + metrics: + - type: similarity_accuracy + value: 83.8469 + - type: similarity_accuracy_threshold + value: 77.6695 + - type: similarity_f1 + value: 62.3159 + - type: similarity_f1_threshold + value: 71.6554 + - type: similarity_precision + value: 59.114599999999996 + - type: similarity_recall + value: 65.8839 + - type: similarity_ap + value: 67.00930000000001 + - type: cosine_accuracy + value: 83.8469 + - 
type: cosine_accuracy_threshold + value: 77.6695 + - type: cosine_f1 + value: 62.3159 + - type: cosine_f1_threshold + value: 71.6554 + - type: cosine_precision + value: 59.114599999999996 + - type: cosine_recall + value: 65.8839 + - type: cosine_ap + value: 67.00930000000001 + - type: manhattan_accuracy + value: 83.7694 + - type: manhattan_accuracy_threshold + value: 1677.8293999999999 + - type: manhattan_f1 + value: 62.1324 + - type: manhattan_f1_threshold + value: 1848.6641 + - type: manhattan_precision + value: 61.839999999999996 + - type: manhattan_recall + value: 62.4274 + - type: manhattan_ap + value: 66.8849 + - type: euclidean_accuracy + value: 83.8469 + - type: euclidean_accuracy_threshold + value: 66.8288 + - type: euclidean_f1 + value: 62.3159 + - type: euclidean_f1_threshold + value: 75.2922 + - type: euclidean_precision + value: 59.114599999999996 + - type: euclidean_recall + value: 65.8839 + - type: euclidean_ap + value: 67.00930000000001 + - type: dot_accuracy + value: 83.8469 + - type: dot_accuracy_threshold + value: 77.6695 + - type: dot_f1 + value: 62.3159 + - type: dot_f1_threshold + value: 71.6554 + - type: dot_precision + value: 59.114599999999996 + - type: dot_recall + value: 65.8839 + - type: dot_ap + value: 67.00930000000001 + - type: max_accuracy + value: 83.8469 + - type: max_f1 + value: 62.3159 + - type: max_precision + value: 61.839999999999996 + - type: max_recall + value: 65.8839 + - type: max_ap + value: 67.00930000000001 + - type: main_score + value: 67.00930000000001 + task: + type: PairClassification + - dataset: + config: default + name: MTEB TwitterURLCorpus (default) + revision: 8b6510b0b1fa4e4c4f879467980e9be563ec1cdf + split: test + type: mteb/twitterurlcorpus-pairclassification + metrics: + - type: similarity_accuracy + value: 88.8811 + - type: similarity_accuracy_threshold + value: 71.1053 + - type: similarity_f1 + value: 77.9005 + - type: similarity_f1_threshold + value: 67.5068 + - type: similarity_precision + value: 
75.5609 + - type: similarity_recall + value: 80.3896 + - type: similarity_ap + value: 85.459 + - type: cosine_accuracy + value: 88.8811 + - type: cosine_accuracy_threshold + value: 71.1053 + - type: cosine_f1 + value: 77.9005 + - type: cosine_f1_threshold + value: 67.5068 + - type: cosine_precision + value: 75.5609 + - type: cosine_recall + value: 80.3896 + - type: cosine_ap + value: 85.459 + - type: manhattan_accuracy + value: 88.8598 + - type: manhattan_accuracy_threshold + value: 1928.9173 + - type: manhattan_f1 + value: 77.9172 + - type: manhattan_f1_threshold + value: 2007.8883999999998 + - type: manhattan_precision + value: 76.29310000000001 + - type: manhattan_recall + value: 79.6119 + - type: manhattan_ap + value: 85.4464 + - type: euclidean_accuracy + value: 88.8811 + - type: euclidean_accuracy_threshold + value: 76.0193 + - type: euclidean_f1 + value: 77.9005 + - type: euclidean_f1_threshold + value: 80.6141 + - type: euclidean_precision + value: 75.5609 + - type: euclidean_recall + value: 80.3896 + - type: euclidean_ap + value: 85.459 + - type: dot_accuracy + value: 88.8811 + - type: dot_accuracy_threshold + value: 71.1053 + - type: dot_f1 + value: 77.9005 + - type: dot_f1_threshold + value: 67.5068 + - type: dot_precision + value: 75.5609 + - type: dot_recall + value: 80.3896 + - type: dot_ap + value: 85.459 + - type: max_accuracy + value: 88.8811 + - type: max_f1 + value: 77.9172 + - type: max_precision + value: 76.29310000000001 + - type: max_recall + value: 80.3896 + - type: max_ap + value: 85.459 + - type: main_score + value: 85.459 + task: + type: PairClassification +--- + +

Snowflake's Arctic-embed-l-v2.0

+

+

+ News | + Models | + Usage | + Evaluation | + Contact | + FAQ + License | + Acknowledgement +

+

+ + + +## News +- 12/11/2024: Release of [Technical Report](https://arxiv.org/abs/2412.04506) +- 12/04/2024: Release of [snowflake-arctic-embed-l-v2.0](https://huggingface.co/Snowflake/snowflake-arctic-embed-l-v2.0) and [snowflake-arctic-embed-m-v2.0](https://huggingface.co/Snowflake/snowflake-arctic-embed-m-v2.0) our newest models with multilingual workloads in mind. + +## Models +Snowflake arctic-embed-l-v2.0 is the newest addition to the suite of embedding models Snowflake has released optimizing for retrieval performance and inference efficiency. +Arctic Embed 2.0 introduces a new standard for multilingual embedding models, combining high-quality multilingual text retrieval without sacrificing performance in English. +Released under the permissive Apache 2.0 license, Arctic Embed 2.0 is ideal for applications that demand reliable, enterprise-grade multilingual search and retrieval at scale. + +Key Features: + +1. Multilingual without compromise: Excels in English and non-English retrieval, outperforming leading open-source and proprietary models on benchmarks like MTEB Retrieval, CLEF, and MIRACL. + +2. Inference efficiency: Its 303m non-embedding parameters inference is fast and efficient for any scale. + +3. Compression-friendly: Achieves high-quality retrieval with embeddings as small as 128 bytes/vector using Matryoshka Representation Learning (MRL) and quantization-aware embedding training. **Please note that like our v1.5 model, the MRL for this model is 256 dimensions, and high-quality 128-byte compression is achieved via 4-bit quantization (e.g. using a [`pq256x4fs` fast-scan FAISS index](https://github.com/facebookresearch/faiss/wiki/The-index-factory#encodings) or using the [example code published alongside our 1.5 model](https://github.com/Snowflake-Labs/arctic-embed/blob/main/compressed_embeddings_examples/score_arctic_embed_m_v1dot5_with_quantization.ipynb)).** + +4. 
Drop-In Replacement: arctic-embed-l-v2.0 builds on [BAAI/bge-m3-retromae](https://huggingface.co/BAAI/bge-m3-retromae) which allows direct drop-in inference replacement with any form of new libraries, kernels, inference engines etc. + +5. Long Context Support: arctic-embed-l-v2.0 builds on [BAAI/bge-m3-retromae](https://huggingface.co/BAAI/bge-m3-retromae) which can support a context window of up to 8192 via the use of RoPE. + + +### Quality Benchmarks +Unlike most other open-source models, Arctic-embed-l-v2.0 excels across English (via MTEB Retrieval) and multilingual (via MIRACL and CLEF). +You no longer need to support models to empower high-quality English and multilingual retrieval. All numbers mentioned below are the average NDCG@10 across the dataset being discussed. + +| Model Name | # params | # non-emb params | # dimensions | BEIR (15) | MIRACL (4) | CLEF (Focused) | CLEF (Full) | +|---|:---:|:---:|:---:|:---:|:---:|:---:|:---:| +| **snowflake-arctic-l-v2.0** | 568M | 303M | 1024 | **55.6** | 55.8 | **52.9** | **54.3** | +| snowflake-arctic-m | 109M | 86M | 768 | 54.9 | 24.9 | 34.4 | 29.1 | +| snowflake-arctic-l | 335M | 303M | 1024 | 56.0 | 34.8 | 38.2 | 33.7 | +| me5 base | 560M | 303M | 1024 | 51.4 | 54.0 | 43.0 | 34.6 | +| bge-m3 (BAAI) | 568M | 303M | 1024 | 48.8 | **56.8** | 40.8 | 41.3 | +| gte (Alibaba) | 305M | 113M | 768 | 51.1 | 52.3 | 47.7 | 53.1 | + +Aside from high-quality retrieval arctic delivers embeddings that are easily compressible. Leverage vector truncation via MRL to decrease vector size by 4x with less than 3% degredation in quality. +Combine MRLed vectors with vector compression (Int4) to power retrieval in 128 bytes per doc. 
| Model | # dimensions | BEIR (15) | Relative Performance | MIRACL (4) | Relative Performance | CLEF (5) | Relative Performance | CLEF (Full) | Relative Performance |
+ +```python +import torch +from transformers import AutoModel, AutoTokenizer + +model_name = 'Snowflake/snowflake-arctic-embed-l-v2.0' +tokenizer = AutoTokenizer.from_pretrained(model_name) +model = AutoModel.from_pretrained(model_name, add_pooling_layer=False) +model.eval() + +query_prefix = 'query: ' +queries = ['what is snowflake?', 'Where can I get the best tacos?'] +queries_with_prefix = ["{}{}".format(query_prefix, i) for i in queries] +query_tokens = tokenizer(queries_with_prefix, padding=True, truncation=True, return_tensors='pt', max_length=8192) + +documents = ['The Data Cloud!', 'Mexico City of Course!'] +document_tokens = tokenizer(documents, padding=True, truncation=True, return_tensors='pt', max_length=8192) + +# Compute token embeddings +with torch.no_grad(): + query_embeddings = model(**query_tokens)[0][:, 0] + document_embeddings = model(**document_tokens)[0][:, 0] + + +# normalize embeddings +query_embeddings = torch.nn.functional.normalize(query_embeddings, p=2, dim=1) +document_embeddings = torch.nn.functional.normalize(document_embeddings, p=2, dim=1) + +scores = torch.mm(query_embeddings, document_embeddings.transpose(0, 1)) +for query, query_scores in zip(queries, scores): + doc_score_pairs = list(zip(documents, query_scores)) + doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True) + #Output passages & scores + print("Query:", query) + for document, score in doc_score_pairs: + print(score, document) +``` + + +This should produce the following scores + +``` +Query: what is snowflake? +tensor(0.2715) The Data Cloud! +tensor(0.0661) Mexico City of Course! +Query: Where can I get the best tacos? +tensor(0.2797) Mexico City of Course! +tensor(0.1250) The Data Cloud! 
+``` + +### Using Huggingface Transformers.js + +If you haven't already, you can install the [Transformers.js](https://huggingface.co/docs/transformers.js) JavaScript library from [NPM](https://www.npmjs.com/package/@huggingface/transformers) using: +```bash +npm i @huggingface/transformers +``` + +You can then use the model for retrieval, as follows: + +```js +import { pipeline, dot } from '@huggingface/transformers'; + +// Create feature extraction pipeline +const extractor = await pipeline('feature-extraction', 'Snowflake/snowflake-arctic-embed-m-v2.0', { + dtype: 'q8', +}); + +// Generate sentence embeddings +const sentences = [ + 'query: what is snowflake?', + 'The Data Cloud!', + 'Mexico City of Course!', +] +const output = await extractor(sentences, { normalize: true, pooling: 'cls' }); + +// Compute similarity scores +const [source_embeddings, ...document_embeddings ] = output.tolist(); +const similarities = document_embeddings.map(x => dot(source_embeddings, x)); +console.log(similarities); // [0.24783534471401417, 0.05313122704326892] +``` + + +## Contact + + +Feel free to open an issue or pull request if you have any questions or suggestions about this project. +You also can email Daniel Campos(daniel.campos@snowflake.com). + + +## License +Arctic is licensed under the [Apache-2](https://www.apache.org/licenses/LICENSE-2.0). The released models can be used for commercial purposes free of charge. 
\ No newline at end of file diff --git a/victord/sub19/models/snowflake-arctic-embed-l-v2.0/config.json b/victord/sub19/models/snowflake-arctic-embed-l-v2.0/config.json new file mode 100644 index 0000000000000000000000000000000000000000..712fdc7164d3eaa5491b39aadaf328669e4566fa --- /dev/null +++ b/victord/sub19/models/snowflake-arctic-embed-l-v2.0/config.json @@ -0,0 +1,30 @@ +{ + "architectures": [ + "XLMRobertaModel" + ], + "attention_probs_dropout_prob": 0.1, + "bos_token_id": 0, + "classifier_dropout": null, + "eos_token_id": 2, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 4096, + "layer_norm_eps": 1e-05, + "matryoshka_dimensions": [ + 256 + ], + "max_position_embeddings": 8194, + "model_type": "xlm-roberta", + "num_attention_heads": 16, + "num_hidden_layers": 24, + "output_past": true, + "pad_token_id": 1, + "position_embedding_type": "absolute", + "torch_dtype": "bfloat16", + "transformers_version": "4.53.2", + "type_vocab_size": 1, + "use_cache": true, + "vocab_size": 250002 +} diff --git a/victord/sub19/models/snowflake-arctic-embed-l-v2.0/config_sentence_transformers.json b/victord/sub19/models/snowflake-arctic-embed-l-v2.0/config_sentence_transformers.json new file mode 100644 index 0000000000000000000000000000000000000000..3f6705f4122c2fe74af25e930a704429b993a6ea --- /dev/null +++ b/victord/sub19/models/snowflake-arctic-embed-l-v2.0/config_sentence_transformers.json @@ -0,0 +1,14 @@ +{ + "__version__": { + "sentence_transformers": "5.1.1", + "transformers": "4.53.2", + "pytorch": "2.9.0+cu128" + }, + "prompts": { + "query": "query: ", + "document": "" + }, + "default_prompt_name": null, + "model_type": "SentenceTransformer", + "similarity_fn_name": "cosine" +} \ No newline at end of file diff --git a/victord/sub19/models/snowflake-arctic-embed-l-v2.0/model.safetensors b/victord/sub19/models/snowflake-arctic-embed-l-v2.0/model.safetensors new file mode 100644 index 
0000000000000000000000000000000000000000..8da9214760679c3a5a6c8c22e0494e5ed561535f --- /dev/null +++ b/victord/sub19/models/snowflake-arctic-embed-l-v2.0/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec8cf057e51daae41e5846c7b8b5f6199528bf0cdc16802f9aa7616ae85c4e7b +size 1135554736 diff --git a/victord/sub19/models/snowflake-arctic-embed-l-v2.0/modules.json b/victord/sub19/models/snowflake-arctic-embed-l-v2.0/modules.json new file mode 100644 index 0000000000000000000000000000000000000000..952a9b81c0bfd99800fabf352f69c7ccd46c5e43 --- /dev/null +++ b/victord/sub19/models/snowflake-arctic-embed-l-v2.0/modules.json @@ -0,0 +1,20 @@ +[ + { + "idx": 0, + "name": "0", + "path": "", + "type": "sentence_transformers.models.Transformer" + }, + { + "idx": 1, + "name": "1", + "path": "1_Pooling", + "type": "sentence_transformers.models.Pooling" + }, + { + "idx": 2, + "name": "2", + "path": "2_Normalize", + "type": "sentence_transformers.models.Normalize" + } +] \ No newline at end of file diff --git a/victord/sub19/models/snowflake-arctic-embed-l-v2.0/sentence_bert_config.json b/victord/sub19/models/snowflake-arctic-embed-l-v2.0/sentence_bert_config.json new file mode 100644 index 0000000000000000000000000000000000000000..306f5e30b3047fbad6af657cae7db9b911d72216 --- /dev/null +++ b/victord/sub19/models/snowflake-arctic-embed-l-v2.0/sentence_bert_config.json @@ -0,0 +1,4 @@ +{ + "max_seq_length": 8192, + "do_lower_case": false +} \ No newline at end of file diff --git a/victord/sub19/models/snowflake-arctic-embed-l-v2.0/sentencepiece.bpe.model b/victord/sub19/models/snowflake-arctic-embed-l-v2.0/sentencepiece.bpe.model new file mode 100644 index 0000000000000000000000000000000000000000..7a3f40a75f870bc1f21700cd414dc2acc431583c --- /dev/null +++ b/victord/sub19/models/snowflake-arctic-embed-l-v2.0/sentencepiece.bpe.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865 +size 5069051 diff --git a/victord/sub19/models/snowflake-arctic-embed-l-v2.0/special_tokens_map.json b/victord/sub19/models/snowflake-arctic-embed-l-v2.0/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b1879d702821e753ffe4245048eee415d54a9385 --- /dev/null +++ b/victord/sub19/models/snowflake-arctic-embed-l-v2.0/special_tokens_map.json @@ -0,0 +1,51 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "cls_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "mask_token": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "sep_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/victord/sub19/models/snowflake-arctic-embed-l-v2.0/tokenizer.json b/victord/sub19/models/snowflake-arctic-embed-l-v2.0/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..225ca0f803c45cce8b17cafe2dde2c7682e91938 --- /dev/null +++ b/victord/sub19/models/snowflake-arctic-embed-l-v2.0/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9a6af42442a3e3e9f05f618eae0bb2d98ca4f6a6406cb80ef7a4fa865204d61 +size 17083052 diff --git a/victord/sub19/models/snowflake-arctic-embed-l-v2.0/tokenizer_config.json b/victord/sub19/models/snowflake-arctic-embed-l-v2.0/tokenizer_config.json new 
file mode 100644 index 0000000000000000000000000000000000000000..68ac76ef77cee8a9cab2bb89c94e852edca5a5d8 --- /dev/null +++ b/victord/sub19/models/snowflake-arctic-embed-l-v2.0/tokenizer_config.json @@ -0,0 +1,62 @@ +{ + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250001": { + "content": "", + "lstrip": true, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": true, + "cls_token": "", + "eos_token": "", + "extra_special_tokens": {}, + "mask_token": "", + "max_length": 512, + "model_max_length": 8192, + "pad_to_multiple_of": null, + "pad_token": "", + "pad_token_type_id": 0, + "padding_side": "right", + "sep_token": "", + "stride": 0, + "tokenizer_class": "XLMRobertaTokenizer", + "truncation_side": "right", + "truncation_strategy": "longest_first", + "unk_token": "" +}