File size: 1,706 Bytes
4ce2b3e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
from __future__ import annotations

from dataclasses import dataclass

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

from .text import tokenize, tokenized_documents


def tfidf_tokenizer(text: str) -> list[str]:
    """Tokenize ``text`` with the settings used for TF-IDF features.

    Delegates to ``tokenize`` with ``remove_stopwords=False`` and
    ``min_len=2`` and returns its result unchanged.
    """
    tokens = tokenize(text, min_len=2, remove_stopwords=False)
    return tokens


class DenseTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that densifies inputs exposing ``toarray``."""

    def fit(self, x, y=None):
        """Do nothing and return ``self``; there is no state to learn."""
        return self

    def transform(self, x):
        """Return ``x.toarray()`` when ``x`` has that method, else ``x`` itself."""
        if not hasattr(x, "toarray"):
            # Nothing to convert: hand the input back untouched.
            return x
        return x.toarray()


def vectorize_with_w2v(model, texts: list[str], vector_size: int) -> np.ndarray:
    """Embed each text as the mean of its in-vocabulary token vectors.

    Args:
        model: Object whose ``wv`` attribute supports ``token in wv`` and
            ``wv[token]`` lookups.
        texts: Raw documents; tokenized via ``tokenized_documents`` with
            ``remove_stopwords=False``.
        vector_size: Number of columns in the result.

    Returns:
        A ``float32`` array with one row per document and ``vector_size``
        columns. A document with no token found in ``model.wv`` keeps an
        all-zero row.
    """
    token_lists = tokenized_documents(texts, remove_stopwords=False)
    embeddings = np.zeros((len(token_lists), vector_size), dtype=np.float32)

    for doc_pos, doc_tokens in enumerate(token_lists):
        known = [model.wv[tok] for tok in doc_tokens if tok in model.wv]
        if not known:
            # No usable tokens: the pre-filled zero row stands.
            continue
        embeddings[doc_pos] = np.mean(known, axis=0)

    return embeddings


@dataclass
class W2VBundle:
    """Bundle a word-vector model with a classifier that consumes its features.

    Every prediction method turns the incoming texts into a feature matrix via
    ``vectorize_with_w2v`` and forwards that matrix to ``classifier``.
    """

    # Embedding model passed as ``model`` to ``vectorize_with_w2v``.
    word2vec: object
    # Estimator that receives the feature matrix; must provide ``predict``.
    classifier: object
    # Column count requested from ``vectorize_with_w2v``.
    vector_size: int

    def _features(self, texts: list[str]) -> np.ndarray:
        """Build the feature matrix for ``texts`` with the bundled model."""
        return vectorize_with_w2v(self.word2vec, texts, self.vector_size)

    def predict(self, texts: list[str]) -> np.ndarray:
        """Return ``classifier.predict`` applied to the vectorized ``texts``."""
        return self.classifier.predict(self._features(texts))

    def predict_proba(self, texts: list[str]):
        """Return ``classifier.predict_proba`` output, or ``None`` if unsupported.

        The capability check runs before vectorization so that no embedding
        work is done for a classifier lacking ``predict_proba``.
        """
        if not hasattr(self.classifier, "predict_proba"):
            return None
        return self.classifier.predict_proba(self._features(texts))

    def decision_function(self, texts: list[str]):
        """Return ``classifier.decision_function`` output, or ``None`` if unsupported.

        The capability check runs before vectorization so that no embedding
        work is done for a classifier lacking ``decision_function``.
        """
        if not hasattr(self.classifier, "decision_function"):
            return None
        return self.classifier.decision_function(self._features(texts))