File size: 1,745 Bytes
d992912
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import re
from collections import Counter
from typing import Dict, List

import numpy as np

# Names exported by a star-import of this module.
__all__ = ["SimpleBM25"]


class SimpleBM25:
    """Minimal in-memory BM25 scorer over a fixed list of documents.

    Call :meth:`fit` once with the corpus, then :meth:`score_candidates`
    to score a query against a chosen subset of the fitted documents.

    Per query term the score contribution is::

        idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * dl / avg_dl))

    with ``idf = log((N - df + 0.5) / (df + 0.5) + 1)``.
    """

    # Compiled once; tokens are maximal runs of ASCII lowercase letters
    # delimited by word boundaries (input is lower-cased first).
    _TOKEN_RE = re.compile(r'\b[a-z]+\b')

    def __init__(self, k1: float = 1.5, b: float = 0.75):
        """Store the BM25 hyper-parameters and create an empty index.

        Args:
            k1: Term-frequency saturation parameter.
            b: Document-length normalisation parameter.
        """
        self.k1 = k1
        self.b = b
        self.doc_tokens: List[List[str]] = []  # tokens of each fitted document
        self.avg_dl: float = 0  # mean token count per document (set by fit)
        self.df: Dict[str, int] = {}  # token -> number of documents containing it
        self.n_docs: int = 0

    def fit(self, documents: List[str]):
        """Tokenize ``documents`` and rebuild all corpus statistics.

        Any previously fitted state is replaced.

        Args:
            documents: The corpus; each item is passed through ``str()``
                before tokenizing.
        """
        self.doc_tokens = [self._tokenize(d) for d in documents]
        self.n_docs = len(self.doc_tokens)

        mean_len = (
            float(np.mean([len(t) for t in self.doc_tokens]))
            if self.doc_tokens
            else 0.0
        )
        # Keep the average strictly positive so it is always a safe divisor,
        # even when the corpus is empty or every document has no tokens.
        self.avg_dl = mean_len if mean_len > 0 else 1.0

        self.df = Counter()
        for tokens in self.doc_tokens:
            # set() so a token counts once per document, not once per occurrence.
            self.df.update(set(tokens))

    def _tokenize(self, text: str) -> List[str]:
        """Return the lower-cased ASCII-letter tokens of ``text``."""
        return self._TOKEN_RE.findall(str(text).lower())

    def score_candidates(self, query: str, candidate_indices: List[int]) -> np.ndarray:
        """Score ``query`` against the fitted documents at ``candidate_indices``.

        Args:
            query: Free-text query; tokenized the same way as the documents.
            candidate_indices: Positions into the list passed to :meth:`fit`.

        Returns:
            A ``float32`` array with one score per entry of
            ``candidate_indices``, in the same order. Indices outside
            ``[0, n_docs)`` (including negative ones) get a score of 0.
        """
        scores = np.zeros(len(candidate_indices), dtype=np.float32)

        # IDF depends only on the corpus, so compute it once per query term.
        # Query order and duplicates are kept: a repeated term counts twice.
        weighted_terms = [
            (qt, np.log((self.n_docs - self.df[qt] + 0.5) / (self.df[qt] + 0.5) + 1))
            for qt in self._tokenize(query)
            if qt in self.df
        ]
        if not weighted_terms:
            return scores

        n_indexed = len(self.doc_tokens)
        avg_dl = self.avg_dl or 1.0  # guard against a zero average
        tf_gain = self.k1 + 1

        for i, doc_idx in enumerate(candidate_indices):
            # Reject negative indices too: they would otherwise wrap around
            # and silently score an unrelated document.
            if not 0 <= doc_idx < n_indexed:
                continue
            doc = self.doc_tokens[doc_idx]
            tf_doc = Counter(doc)
            length_norm = self.k1 * (1 - self.b + self.b * len(doc) / avg_dl)
            s = 0.0
            for qt, idf in weighted_terms:
                tf = tf_doc.get(qt, 0)
                if not tf:
                    # An absent term contributes nothing; skipping it also
                    # avoids 0/0 when k1 == 0.
                    continue
                num = tf * tf_gain
                den = tf + length_norm
                s += idf * num / den
            scores[i] = s
        return scores