| | """Generate a similarity matrix (doc-term score matrix) based on textacy.representations.Vectorizer. |
| | |
| | refer also to fast-scores fast_scores.py and gen_model.py (sklearn.feature_extraction.text.TfidfVectorizer). |
| | originally docterm_scores.py. |
| | """ |
| | from typing import Dict, Iterable, List, Optional, Union |
| | import numpy as np |
| | from itertools import chain |
| | from psutil import virtual_memory |
| | from more_itertools import ilen |
| |
|
| | from textacy.representations import Vectorizer |
| | |
| | from logzero import logger |
| |
|
| | |
| | from gradiobee.gen_model import gen_model |
| |
|
| |
|
| | |
def _as_token_lists(doc: Iterable[Iterable[str]], name: str) -> List[List[str]]:
    """Materialise *doc* as a list of token lists, checking every token is a str.

    Copying into lists makes one-shot iterables (generators, iterators) safe to
    traverse several times further down.

    Raises:
        AssertionError: if any token is not a ``str``.
        Exception: anything raised while iterating *doc* is logged and re-raised.
    """
    rows: List[List[str]] = []
    try:
        for row in doc:
            tokens = list(row)
            if not all(isinstance(tok, str) for tok in tokens):
                # Explicit raise rather than ``assert`` so that the check is
                # not stripped when Python runs with ``-O``.
                raise AssertionError(
                    f" {name} is not of the typing Iterable[Iterable[str]] "
                )
            rows.append(tokens)
    except AssertionError:
        # Validation failures are reported as-is, without the generic logging.
        raise
    except Exception as exc:
        logger.error(exc)
        raise
    return rows


def smatrix(
    doc1: Iterable[Iterable[str]],
    doc2: Iterable[Iterable[str]],
    model: Optional[Vectorizer] = None,
    tf_type: str = 'linear',
    idf_type: Optional[str] = "smooth",
    dl_type: Optional[str] = None,
    norm: Optional[str] = "l2",
    min_df: Union[int, float] = 1,
    max_df: Union[int, float] = 1.0,
    max_n_terms: Optional[int] = None,
    vocabulary_terms: Optional[Union[Dict[str, int], Iterable[str]]] = None
) -> np.ndarray:
    """Generate a similarity matrix from doc-term scores of two tokenized docs.

    Both inputs are transformed with the same model and the result is the
    matrix product ``dt2 · dt1.T``.

    Args:
        doc1: tokenized doc with n1 entries (each entry an iterable of str).
            May be a one-shot iterable; it is copied into lists first.
        doc2: tokenized doc with n2 entries, same format as ``doc1``.
        model: a fitted vectorizer exposing ``transform``; if None, one is
            generated ad hoc via ``gen_model`` from doc1 and doc2 together.
        tf_type, idf_type, dl_type, norm, min_df, max_df, max_n_terms,
        vocabulary_terms: forwarded unchanged to ``gen_model`` when ``model``
            is None; refer to textacy.representations.Vectorizer.

    Attributes:
        model: the model used by the most recent call, stored on the function
            object as ``smatrix.model``.

    Returns:
        n2 x n1 matrix of float numbers: row i corresponds to entry i of
        ``doc2``, column j to entry j of ``doc1``.

    Raises:
        AssertionError: if ``doc1`` or ``doc2`` contains a non-str token.
    """
    docs1 = _as_token_lists(doc1, "doc1")
    docs2 = _as_token_lists(doc2, "doc2")

    if model is None:
        model = gen_model(
            [*chain(docs1, docs2)],
            tf_type=tf_type,
            idf_type=idf_type,
            dl_type=dl_type,
            norm=norm,
            min_df=min_df,
            max_df=max_df,
            max_n_terms=max_n_terms,
            vocabulary_terms=vocabulary_terms
        )

    # Expose the model in use so callers can inspect or reuse it afterwards.
    smatrix.model = model

    dt1 = model.transform(docs1)
    dt2 = model.transform(docs2)

    # Size estimate for the result only (presumably 8 bytes per float64 cell);
    # the dense intermediates created by ``toarray()`` below are not counted.
    require_ram = len(docs1) * len(docs2) * 8
    available = virtual_memory().available
    if require_ram > available:
        logger.warning("virtual_memory().available: %s", available)
        logger.warning("memory required: %s", require_ram)

        if require_ram > available * 10:
            logger.warning("You're likely to encounter memory problem, such as slowing down response and/or OOM.")

    return dt2.toarray().dot(dt1.toarray().T)
| |
|