In [1]:
# Shell escape: runs settings.py as a separate process (its printed output appears below).
# NOTE(review): this does not bring any names into the kernel; the import cell does that via `from settings import ...`.
!python settings.py

Using device: cuda


In [None]:
import os
import json
import pandas as pd
from pprint import pprint
from tqdm.autonotebook import tqdm

from sentence_transformers import SentenceTransformer

from mteb import MTEB
from mteb.abstasks.TaskMetadata import TaskMetadata
from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval

from settings import MODEL_NAME, OUTPUT_DIR, DEVICE, BATCH_SIZE
# Set WANDB_DISABLED for this process; presumably turns off Weights & Biases
# logging in libraries that honour the variable — confirm for the installed versions.
os.environ['WANDB_DISABLED'] = 'true'

In [4]:
# Read the three processed parquet tables into one dict keyed by role.
data = dict(
    corpus=pd.read_parquet('data/processed/corpus_data.parquet'),
    train=pd.read_parquet('data/processed/train_data.parquet'),
    test=pd.read_parquet('data/processed/test_data.parquet'),
)

# In the query splits, turn every cell of the two list-valued columns into a
# plain Python list via its .tolist() method (cells are presumably numpy
# arrays as returned by the parquet reader — confirm).
for split in ('train', 'test'):
    for column in ('cid', 'context_list'):
        data[split][column] = data[split][column].apply(lambda cell: cell.tolist())

In [5]:
class BKAILegalDocRetrievalTask(AbsTaskRetrieval):
    """Retrieval task built from the notebook's in-memory `data` frames.

    The constructor reads the module-level `data` dict (keys 'corpus',
    'train' and 'test') and fills `corpus`, `queries` and `relevant_docs`
    for both splits. Only 'test' is listed in the metadata's `eval_splits`.
    """

    # Task description handed to the MTEB benchmark.
    metadata = TaskMetadata(
        name='BKAILegalDocRetrieval',
        description='',
        reference='https://github.com/embeddings-benchmark/mteb/blob/main/docs/adding_a_dataset.md',
        type='Retrieval',
        category='s2p',
        modalities=['text'],
        eval_splits=['test'],
        eval_langs=['vi'],
        main_score='ndcg_at_10',
        other_scores=['recall_at_10', 'precision_at_10', 'map'],
        dataset={
            'path'    : 'data',
            'revision': 'd4c5a8ba10ae71224752c727094ac4c46947fa29',
        },
        date=('2012-01-01', '2020-01-01'),
        form='Written',
        domains=['Academic', 'Non-fiction'],
        task_subtypes=['Scientific Reranking'],
        license='cc-by-nc-4.0',
        annotations_creators='derived',
        dialect=[],
        text_creation='found',
        bibtex_citation=''
    )

    # Class-level flag, already True before any instance exists.
    # NOTE(review): presumably makes MTEB skip its own dataset loader — confirm
    # against the installed mteb version.
    data_loaded = True

    def __init__(self, **kwargs):
        """Populate corpus, queries and relevance judgements from `data`.

        Args:
            **kwargs: forwarded unchanged to the parent constructor.
        """
        super().__init__(**kwargs)

        split_names = ['train', 'test']

        # Corpus ids get a 'c' prefix; each entry keeps the text and the raw id.
        shared_corpus = {
            f"c{doc['cid']}": {'text': doc['text'], '_id': doc['cid']}
            for _, doc in data['corpus'].iterrows()
        }

        # Every split points at the very same corpus dict (no copies made).
        self.corpus        = {name: shared_corpus for name in split_names}
        self.queries       = {name: {} for name in split_names}
        self.relevant_docs = {name: {} for name in split_names}

        for name in split_names:
            split_queries = self.queries[name]
            split_qrels   = self.relevant_docs[name]

            for _, record in data[name].iterrows():
                raw_qid, raw_cids = record['qid'], record['cid']

                # Query ids get a 'q' prefix, document ids a 'c' prefix.
                query_key = f'q{raw_qid}'
                doc_keys  = [f'c{raw_cid}' for raw_cid in raw_cids]

                split_queries[query_key] = record['question']

                # Rows repeating a query id add to the same judgement dict;
                # every listed document is stored with relevance 1.
                judgements = split_qrels.setdefault(query_key, {})
                for doc_key in doc_keys:
                    judgements[doc_key] = 1

        self.data_loaded = True

In [6]:
# Load the model saved under OUTPUT_DIR onto the device chosen in settings.
fine_tuned_model = SentenceTransformer(
    model_name_or_path=OUTPUT_DIR,
    device=DEVICE,
)

In [7]:
# Wrap the custom retrieval task in an MTEB runner and evaluate the model.
custom_task = BKAILegalDocRetrievalTask()
evaluation  = MTEB(tasks=[custom_task])

# The batch size goes through `encode_kwargs`: passing `batch_size=` directly
# is deprecated and emits a warning that it will be removed in the next release.
# Kept as the cell's last expression so the returned result list is displayed.
evaluation.run(fine_tuned_model, encode_kwargs={'batch_size': BATCH_SIZE})

The `batch_size` argument is deprecated and will be removed in the next release. Please use `encode_kwargs = {'batch_size': ...}` to set the batch size instead.


Batches:   0%|          | 0/233 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:02<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:02<?, ?it/s]

Batches:   0%|          | 0/91 [00:02<?, ?it/s]

[TaskResult(task_name=BKAILegalDocRetrieval, scores=...)]

In [8]:
# Hard-coded location of the JSON report for this task.
# NOTE(review): presumably written by the MTEB run above under its default
# output folder — confirm if the model gains a name/revision.
file_path = "results/no_model_name_available/no_revision_available/BKAILegalDocRetrieval.json"

with open(file_path, mode='r', encoding='utf-8') as report_file:
    eval_data = json.load(report_file)

# First entry of the 'test' split's score list.
scores = eval_data["scores"]["test"][0]

# Display label -> value; .get() yields None for keys absent from the report.
main_metrics = {
    label: scores.get(score_key)
    for label, score_key in [
        ('main_score',   'ndcg_at_10'),
        ('recall@10',    'recall_at_10'),
        ('precision@10', 'precision_at_10'),
        ('mrr@10',       'mrr_at_10'),
    ]
}
main_metrics['evaluation_time (s)'] = eval_data.get('evaluation_time')

print('Main Evaluation Metrics (Top-K = 10):')
pprint(main_metrics)

Main Evaluation Metrics (Top-K = 10):
{'evaluation_time (s)': 3061.7869832515717,
 'main_score': 0.60389,
 'mrr@10': 0.555102,
 'precision@10': 0.08587,
 'recall@10': 0.79407}


In [9]:
# Keep only the "<metric>_at_<k>" scores, leaving out every key that starts with 'nauc'.
metrics = {}
for score_name, score_value in scores.items():
    if '_at_' not in score_name or score_name.startswith('nauc'):
        continue
    metrics[score_name] = score_value

# Split each key into its metric name and integer cutoff k.
parsed_metrics = [
    {'metric': metric_name, 'k': int(cutoff), 'score': score_value}
    for score_name, score_value in metrics.items()
    for metric_name, cutoff in [score_name.split('_at_')]
]

# One row per k, one column per metric, rows ordered by ascending k.
df_metrics = (
    pd.DataFrame(parsed_metrics)
    .pivot(index='k', columns='metric', values='score')
    .sort_index()
)

print("\nEvaluation Scores by K:")
print(df_metrics.round(4))


Evaluation Scores by K:
metric     map     mrr    ndcg  precision  recall
k                                                
1       0.4033  0.4242  0.4242     0.4242  0.4033
3       0.5031  0.5247  0.5394     0.2215  0.6232
5       0.5230  0.5434  0.5739     0.1512  0.7047
10      0.5361  0.5551  0.6039     0.0859  0.7941
20      0.5414  0.5596  0.6216     0.0469  0.8611
100     0.5442  0.5617  0.6389     0.0104  0.9480
1000    0.5444  0.5619  0.6444     0.0011  0.9879
