# Importing necessary libraries
import math
import re

# Data handling libraries
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# PyTorch libraries
import torch
from torch import nn
from torch.utils.data import Dataset


# NLP libraries
import nltk
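
# nltk.sent_tokenize (used by split_text below) needs the "punkt" model; downloading it
# here is a convenience -- drop this call if the model is already installed.
nltk.download("punkt", quiet=True)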


# Multithreading libraries
from multiprocessing import Pool, cpu_count

class BM25:
    def __init__(self, corpus, tokenizer=None):
        self.corpus_size = 0
        self.avgdl = 0
        self.doc_freqs = []
        self.idf = {}
        self.doc_len = []
        self.tokenizer = tokenizer

        if tokenizer:
            corpus = self._tokenize_corpus(corpus)

        nd = self._initialize(corpus)
        self._calc_idf(nd)

    def _initialize(self, corpus):
        nd = {}  # word -> number of documents with word
        num_doc = 0
        for document in corpus:
            self.doc_len.append(len(document))
            num_doc += len(document)

            frequencies = {}
            for word in document:
                if word not in frequencies:
                    frequencies[word] = 0
                frequencies[word] += 1
            self.doc_freqs.append(frequencies)

            for word in frequencies:
                nd[word] = nd.get(word, 0) + 1

            self.corpus_size += 1

        self.avgdl = num_doc / self.corpus_size
        return nd

    def _tokenize_corpus(self, corpus):
        # Tokenize documents in parallel across CPU cores; the context manager
        # ensures the worker processes are torn down afterwards.
        with Pool(cpu_count()) as pool:
            tokenized_corpus = pool.map(self.tokenizer, corpus)
        return tokenized_corpus

    def _calc_idf(self, nd):
        raise NotImplementedError()

    def get_scores(self, query):
        raise NotImplementedError()

    def get_batch_scores(self, query, doc_ids):
        raise NotImplementedError()

    def get_top_n(self, query, documents, n=5):
        """Returns the top-n documents along with their min-max scaled (0-1) scores."""
        assert self.corpus_size == len(documents), "The documents given don't match the index corpus!"

        scores = self.get_scores(query)
        min_score = np.min(scores)
        max_score = np.max(scores)

        # Scale scores to 0-1 range
        if max_score != min_score:
            scaled_scores = (scores - min_score) / (max_score - min_score)
        else:
            scaled_scores = np.ones(self.corpus_size)

        top_n_indices = np.argsort(scaled_scores)[::-1][:n]
        top_n_scaled_scores = [scaled_scores[i] for i in top_n_indices]

        return [documents[i] for i in top_n_indices], top_n_scaled_scores


class BM25Okapi(BM25):
    def __init__(self, corpus, tokenizer=None, k1=1.5, b=0.75, epsilon=0.25):
        self.k1 = k1
        self.b = b
        self.epsilon = epsilon
        super().__init__(corpus, tokenizer)

    def _calc_idf(self, nd):
        """
        Calculates frequencies of terms in documents and in corpus.
        This algorithm sets a floor on the idf values to eps * average_idf
        """
        # collect idf sum to calculate an average idf for epsilon value
        idf_sum = 0
        # collect words with negative idf to set them a special epsilon value.
        # idf can be negative if word is contained in more than half of documents
        negative_idfs = []
        for word, freq in nd.items():
            idf = math.log(self.corpus_size - freq + 0.5) - math.log(freq + 0.5)
            self.idf[word] = idf
            idf_sum += idf
            if idf < 0:
                negative_idfs.append(word)
        self.average_idf = idf_sum / len(self.idf)

        eps = self.epsilon * self.average_idf
        for word in negative_idfs:
            self.idf[word] = eps

    def get_scores(self, query):
        """
        The ATIRE BM25 variant uses an idf function which uses a log(idf) score. To prevent negative idf scores,
        this algorithm also adds a floor to the idf value of epsilon.
        See [Trotman, A., X. Jia, M. Crane, Towards an Efficient and Effective Search Engine] for more info
        :param query:
        :return:
        """
        score = np.zeros(self.corpus_size)
        doc_len = np.array(self.doc_len)
        for q in query:
            q_freq = np.array([(doc.get(q) or 0) for doc in self.doc_freqs])
            score += (self.idf.get(q) or 0) * (q_freq * (self.k1 + 1) /
                                               (q_freq + self.k1 * (1 - self.b + self.b * doc_len / self.avgdl)))
        return score

    def get_batch_scores(self, query, doc_ids):
        """
        Calculate bm25 scores between query and subset of all docs
        """
        assert all(di < len(self.doc_freqs) for di in doc_ids)
        score = np.zeros(len(doc_ids))
        doc_len = np.array(self.doc_len)[doc_ids]
        for q in query:
            q_freq = np.array([(self.doc_freqs[di].get(q) or 0) for di in doc_ids])
            score += (self.idf.get(q) or 0) * (q_freq * (self.k1 + 1) /
                                               (q_freq + self.k1 * (1 - self.b + self.b * doc_len / self.avgdl)))
        return score.tolist()
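
# Example usage of BM25Okapi (a minimal sketch; assumes whitespace-tokenized documents):
#
#   corpus = ["it is quite windy in london", "how is the weather today"]
#   bm25 = BM25Okapi([doc.split(" ") for doc in corpus])
#   doc_scores = bm25.get_scores("windy london".split(" "))      # one score per document
#   top_docs, top_scores = bm25.get_top_n("windy london".split(" "), corpus, n=2)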
    
def preprocess_text(text: str) -> str:
    """Strips common punctuation, collapses whitespace, and lowercases."""
    text = re.sub(r"['\",\.\?:\-!]", "", text)
    text = " ".join(text.split())
    return text.lower()
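
# e.g. preprocess_text("What's  the weather, today?") -> "whats the weather today"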

def evidence_top_n(context, query):
    """Splits `context` into sentences and returns the 5 best BM25 matches for `query`."""
    sentences = split_text(context)
    tokenized_sentences = [doc.split(" ") for doc in sentences]
    bm25 = BM25Okapi(tokenized_sentences)
    tokenized_query = query.split(" ")
    top_docs, top_scores = bm25.get_top_n(tokenized_query, sentences, n=5)

    return top_docs, top_scores

def mean_pooling(model_output, attention_mask):
    # First element of model_output contains all token embeddings: (batch, seq_len, hidden)
    token_embeddings = model_output[0]
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    # Average the token embeddings, ignoring padding positions
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
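
# Shape check, for reference: embeddings (B, T, H) * mask (B, T, 1), summed over T, give
# (B, H); dividing by the clamped per-row token counts yields the mean over real tokens.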

def similarities(context: list, text: str, weight: list):
    """Ranks the sentences in `context` by weighted SBERT cosine similarity to `text`."""
    sentences = [text] + context

    # tokenizer_sbert, model_sbert and device are module-level globals that are
    # expected to be initialized elsewhere before this function is called.
    encoded_input = tokenizer_sbert(sentences, padding=True, truncation=True, return_tensors='pt')
    encoded_input = {key: value.to(device) for key, value in encoded_input.items()}

    # Compute token embeddings
    with torch.no_grad():
        model_output = model_sbert(**encoded_input)
    # Perform pooling. In this case, mean pooling.
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    cosine = nn.CosineSimilarity(dim=1, eps=1e-6)
    similarities = []
    claim_embeddings = sentence_embeddings[0].unsqueeze(0)
    for i in range(1, len(sentence_embeddings)):
        evidence_embeddings = sentence_embeddings[i].unsqueeze(0)
        similarity = cosine(claim_embeddings, evidence_embeddings).item()
        # scaled_similarity = ((similarity + 1) / 2) * weight[i-1]
        similarities.append((sentences[i], similarity))

    # Min-max scale the raw cosines, then apply the per-sentence weights
    simi_values = [s[1] for s in similarities]
    scaler = MinMaxScaler()
    scaled_simi_values = scaler.fit_transform(np.array(simi_values).reshape(-1, 1)).flatten()
    similarities = [(sentences[i+1], scaled_value * weight[i]) for i, scaled_value in enumerate(scaled_simi_values)]

    similarities.sort(key=lambda x: x[1], reverse=True)
    top_k = [item[0] for item in similarities[:1]]  # single best sentence
    simi = [item[1] for item in similarities[:1]]   # its weighted score
    top_5 = [item[0] for item in similarities[:5]]
    return top_k, simi, top_5
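
# The globals above are assumed to be set up elsewhere; a minimal sketch with plain
# transformers (the checkpoint name is an illustrative assumption, not the project's):
#
#   from transformers import AutoTokenizer, AutoModel
#   device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#   tokenizer_sbert = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
#   model_sbert = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2").to(device).eval()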

def clean_quotes(sentence):
    # Strip '!', ',' and '?' from text inside double quotes
    return re.sub(r'"([^"]*)"', lambda m: m.group(0).replace('!', '').replace(',', '').replace('?', ''), sentence)

def remove_brackets(text):
    # Strip ellipses and periods from text inside parentheses (the parentheses themselves stay)
    return re.sub(r'\([^)]*\)', lambda m: m.group(0).replace('...', '').replace('.', ''), text)

def split_text(content):
    # Split the text into paragraphs on blank lines
    paragraphs = content.split('\n\n')

    # Normalize each paragraph, then split it into sentences
    sentences = []
    for paragraph in paragraphs:
        paragraph = paragraph.replace('...)', ')')
        paragraph = paragraph.replace('... ,', ',')
        paragraph = re.sub(r'\.\.\.(?=\")', '', paragraph)
        paragraph = paragraph.replace('\n', ' ')  # Remove internal line breaks
        paragraph = clean_quotes(paragraph)
        # Drop a period that precedes a lowercase word and capitalize that word
        paragraph = re.sub(r'\.(\s[a-z])', lambda match: match.group(1).upper(), paragraph)
        paragraph = paragraph.replace(' .', '.')  # Remove space before period
        paragraph = re.sub(r'\?(?=\s+[a-z])', ' ', paragraph)  # Drop a '?' that does not end a sentence
        paragraph = re.sub(r'\.\.\.(?=\,)', '', paragraph)  # Drop "..." before a comma
        paragraph = re.sub(r'\.\.\.(?=\s+[a-z])', ' ', paragraph)  # Drop "..." that does not end a sentence
        paragraph = paragraph.replace('...', '. ')  # Replace remaining "..." with ". "
        paragraph = paragraph.replace('..', '. ')  # Replace ".." with ". "
        paragraph = paragraph.replace('. ', ' . ')  # Pad sentence-final periods with a space on each side
        paragraph = paragraph.replace('  ', ' ')  # Collapse double spaces
        paragraph = paragraph.strip()  # Strip leading/trailing spaces

        # Tokenize the paragraph into sentences using NLTK
        paragraph_sentences = nltk.sent_tokenize(paragraph)
        sentences.extend(paragraph_sentences)

    return sentences
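
# Rough example: split_text("First sentence. Second sentence.\n\nAnd more...") yields
# something like ["First sentence .", "Second sentence.", "And more ."]; the exact split
# depends on NLTK's punkt model (downloaded near the imports above).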

class SentencePairDataset(Dataset):
    """Wraps (sentence1, sentence2) pairs and labels for sequence-pair classification."""

    def __init__(self, sentence_pairs, labels, tokenizer, max_length):
        self.sentence_pairs = sentence_pairs
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.sentence_pairs)

    def __getitem__(self, idx):
        sentence1, sentence2 = self.sentence_pairs[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            sentence1,
            text_pair=sentence2,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding="max_length",
            return_attention_mask=True,
            return_tensors="pt",
            truncation=True,
        )
        return {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "label": torch.tensor(label, dtype=torch.long),
        }
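
# A minimal usage sketch (the tokenizer checkpoint is an assumption for illustration):
#
#   from torch.utils.data import DataLoader
#   from transformers import AutoTokenizer
#   tok = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
#   ds = SentencePairDataset([("claim", "evidence")], [1], tok, max_length=128)
#   loader = DataLoader(ds, batch_size=16, shuffle=True)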
  
class MBERTClassifier(nn.Module):
    """mBERT encoder with normalization, an ELU-activated linear head, and dropout."""

    def __init__(self, mbert, num_classes):
        super().__init__()
        self.mbert = mbert
        self.layer_norm = nn.LayerNorm(self.mbert.config.hidden_size)
        self.dropout = nn.Dropout(0.2)
        self.batch_norm = nn.BatchNorm1d(self.mbert.config.hidden_size)
        self.linear = nn.LazyLinear(num_classes)
        self.activation = nn.ELU()

    def forward(self, input_ids, attention_mask):
        # return_dict=False makes the encoder return (sequence_output, pooled_output)
        _, pooled_output = self.mbert(input_ids=input_ids, attention_mask=attention_mask, return_dict=False)
        norm_output = self.layer_norm(pooled_output)
        batch_norm_output = self.batch_norm(norm_output)
        logits = self.linear(batch_norm_output)
        activated_output = self.activation(logits)
        # Dropout is a no-op in eval mode, so inference outputs stay deterministic
        dropout_output = self.dropout(activated_output)
        return dropout_output

    def predict_proba(self, input_ids, attention_mask):
        logits = self.forward(input_ids, attention_mask)
        probabilities = torch.softmax(logits, dim=-1)
        return probabilities
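
# A minimal usage sketch (checkpoint name is an assumption; `loader` and `device` come
# from the SentencePairDataset sketch above):
#
#   from transformers import AutoModel
#   encoder = AutoModel.from_pretrained("bert-base-multilingual-cased")
#   clf = MBERTClassifier(encoder, num_classes=3).to(device).eval()
#   batch = next(iter(loader))
#   probs = clf.predict_proba(batch["input_ids"].to(device), batch["attention_mask"].to(device))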