Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| from dataclasses import dataclass | |
| import numpy as np | |
| from sklearn.base import BaseEstimator, TransformerMixin | |
| from .text import tokenize, tokenized_documents | |
def tfidf_tokenizer(text: str) -> list[str]:
    """Tokenize ``text`` with the settings used for TF-IDF features.

    Delegates to the project's ``tokenize`` helper with stopword removal
    switched off and ``min_len=2``.
    """
    tokens = tokenize(text, min_len=2, remove_stopwords=False)
    return tokens
class DenseTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that densifies inputs exposing ``toarray()``.

    Anything without a ``toarray`` attribute is passed through unchanged.
    """

    def fit(self, x, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, x):
        """Return ``x.toarray()`` when available, otherwise ``x`` as-is."""
        if not hasattr(x, "toarray"):
            return x
        return x.toarray()
def vectorize_with_w2v(model, texts: list[str], vector_size: int) -> np.ndarray:
    """Embed each text as the mean of its in-vocabulary word vectors.

    Args:
        model: Object whose ``wv`` attribute supports ``token in wv`` and
            ``wv[token]`` lookups.
        texts: Raw documents; tokenized via ``tokenized_documents`` with
            stopword removal disabled.
        vector_size: Number of columns in the returned matrix.

    Returns:
        A ``float32`` array with one row per document and ``vector_size``
        columns. Rows for documents with no known token remain all zeros.
    """
    token_lists = tokenized_documents(texts, remove_stopwords=False)
    features = np.zeros((len(token_lists), vector_size), dtype=np.float32)

    for doc_index, doc_tokens in enumerate(token_lists):
        known_vectors = [model.wv[tok] for tok in doc_tokens if tok in model.wv]
        if not known_vectors:
            # Nothing to average: keep the pre-filled zero row.
            continue
        features[doc_index] = np.mean(known_vectors, axis=0)

    return features
@dataclass
class W2VBundle:
    """Bundle a word-vector model with a classifier trained on its features.

    Declared as a dataclass so it can be constructed directly with
    ``W2VBundle(word2vec=..., classifier=..., vector_size=...)``. Every
    prediction method accepts raw texts and vectorizes them with
    ``vectorize_with_w2v`` before delegating to the classifier.
    """

    # Embedding model; its ``wv`` lookup is what ``vectorize_with_w2v`` reads.
    word2vec: object
    # Estimator with ``predict`` and, optionally, ``predict_proba`` /
    # ``decision_function``.
    classifier: object
    # Number of columns in the feature matrix handed to the classifier.
    vector_size: int

    def _featurize(self, texts: list[str]) -> np.ndarray:
        """Convert raw texts into the feature matrix the classifier consumes."""
        return vectorize_with_w2v(self.word2vec, texts, self.vector_size)

    def predict(self, texts: list[str]) -> np.ndarray:
        """Return the classifier's predictions for ``texts``."""
        return self.classifier.predict(self._featurize(texts))

    def predict_proba(self, texts: list[str]):
        """Return ``classifier.predict_proba`` output, or ``None`` if unsupported."""
        # Check the capability first so no vectorization work is wasted when
        # the classifier cannot produce probabilities.
        if not hasattr(self.classifier, "predict_proba"):
            return None
        return self.classifier.predict_proba(self._featurize(texts))

    def decision_function(self, texts: list[str]):
        """Return ``classifier.decision_function`` output, or ``None`` if unsupported."""
        # Same ordering as predict_proba: skip vectorization when unsupported.
        if not hasattr(self.classifier, "decision_function"):
            return None
        return self.classifier.decision_function(self._featurize(texts))