Spaces:
Running
Running
File size: 1,706 Bytes
4ce2b3e | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 | from __future__ import annotations
from dataclasses import dataclass
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from .text import tokenize, tokenized_documents
def tfidf_tokenizer(text: str) -> list[str]:
    """Split ``text`` into tokens using the project's ``tokenize`` helper.

    The call is fixed to ``remove_stopwords=False`` and ``min_len=2``.
    NOTE(review): presumably used as the tokenizer callback of a TF-IDF
    vectorizer -- confirm against the pipeline that references it.
    """
    tokens = tokenize(text, min_len=2, remove_stopwords=False)
    return tokens
class DenseTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that densifies inputs exposing ``toarray``."""

    def fit(self, x, y=None):
        """Learn nothing; return the transformer itself so calls can chain."""
        return self

    def transform(self, x):
        """Return ``x.toarray()`` when ``x`` has that method, else ``x`` as is."""
        # Inputs without a ``toarray`` method pass through untouched.
        if not hasattr(x, "toarray"):
            return x
        return x.toarray()
def vectorize_with_w2v(model, texts: list[str], vector_size: int) -> np.ndarray:
    """Embed each text as the mean of its known word vectors.

    Args:
        model: Object exposing ``wv``, which must support ``token in wv``
            and ``wv[token]``.
        texts: Raw documents; tokenized via ``tokenized_documents`` with
            ``remove_stopwords=False``.
        vector_size: Number of columns in the returned matrix.

    Returns:
        A ``float32`` array of shape ``(len(docs), vector_size)``. Rows for
        documents with no token present in ``model.wv`` stay all zeros.
    """
    tokenized = tokenized_documents(texts, remove_stopwords=False)
    features = np.zeros((len(tokenized), vector_size), dtype=np.float32)
    for index, words in enumerate(tokenized):
        known = [model.wv[word] for word in words if word in model.wv]
        if not known:
            # Nothing to average: keep the zero row allocated above.
            continue
        features[index] = np.mean(known, axis=0)
    return features
@dataclass
class W2VBundle:
    """Bundle a word-vector model with the classifier that consumes its features.

    Every public method converts raw texts to a feature matrix with
    ``vectorize_with_w2v`` and forwards that matrix to ``classifier``.
    """

    # Model handed to vectorize_with_w2v (which reads its ``wv`` attribute).
    word2vec: object
    # Estimator; must offer ``predict``. ``predict_proba`` and
    # ``decision_function`` are optional and probed with hasattr.
    classifier: object
    # Column count of the feature matrix built by vectorize_with_w2v.
    vector_size: int

    def _featurize(self, texts: list[str]) -> np.ndarray:
        """Build the feature matrix for ``texts`` using the bundled model."""
        return vectorize_with_w2v(self.word2vec, texts, self.vector_size)

    def predict(self, texts: list[str]) -> np.ndarray:
        """Return ``classifier.predict`` applied to the features of ``texts``."""
        return self.classifier.predict(self._featurize(texts))

    def predict_proba(self, texts: list[str]):
        """Return ``classifier.predict_proba`` output, or ``None`` if unsupported."""
        # Check capability first so unsupported classifiers do not pay for
        # vectorizing every text only to discard the result.
        if not hasattr(self.classifier, "predict_proba"):
            return None
        return self.classifier.predict_proba(self._featurize(texts))

    def decision_function(self, texts: list[str]):
        """Return ``classifier.decision_function`` output, or ``None`` if unsupported."""
        # Same ordering as predict_proba: probe before doing the expensive work.
        if not hasattr(self.classifier, "decision_function"):
            return None
        return self.classifier.decision_function(self._featurize(texts))
|