"""Generate a doc-term score matrix based on textacy.representations.Vectorizer.

Refer also to fast-scores' fast_scores.py and gen_model.py (sklearn.feature_extraction.text.TfidfVectorizer).
"""
| from typing import Dict, Iterable, List, Optional, Union |
| import numpy as np |
| from itertools import chain |
| from psutil import virtual_memory |
| from more_itertools import ilen |
|
|
| from textacy.representations import Vectorizer |
| from logzero import logger |
|
|
| from gradiobee.gen_model import gen_model |
|
|
|
|
| |
def _as_token_lists(doc: Iterable[Iterable[str]], name: str) -> List[List[str]]:
    """Materialize ``doc`` as a list of token lists, checking every token is a str.

    Copying into lists means one-shot iterables (generators, iterators) are
    read exactly once here and can then be reused safely by the caller.

    Args:
        doc: tokenized document, an iterable of iterables of str.
        name: argument name used in the error message ("doc1" or "doc2").

    Returns:
        The same tokens as a list of lists.

    Raises:
        AssertionError: if any token is not a str.
        Exception: whatever iterating ``doc`` raises (e.g. TypeError for a
            non-iterable); it is logged and re-raised unchanged.
    """
    message = f" {name} is not of the typing Iterable[Iterable[str]] "
    token_lists: List[List[str]] = []
    try:
        for tokens in doc:
            row = []
            for token in tokens:
                # Raised explicitly (not via ``assert``) so the check
                # survives ``python -O``.
                if not isinstance(token, str):
                    raise AssertionError(message)
                row.append(token)
            token_lists.append(row)
    except AssertionError:
        raise
    except Exception as exc:
        logger.error(exc)
        raise
    return token_lists


def docterm_scores(
    doc1: Iterable[Iterable[str]],
    doc2: Iterable[Iterable[str]],
    model: Optional[Vectorizer] = None,
    tf_type: str = 'linear',
    idf_type: Optional[str] = "smooth",
    dl_type: Optional[str] = None,
    norm: Optional[str] = "l2",
    min_df: Union[int, float] = 1,
    max_df: Union[int, float] = 1.0,
    max_n_terms: Optional[int] = None,
    vocabulary_terms: Optional[Union[Dict[str, int], Iterable[str]]] = None
) -> np.ndarray:
    """Generate a doc-term score matrix based on textacy.representations.Vectorizer.

    Args:
        doc1: tokenized doc of n1 entries (each entry an iterable of str tokens).
        doc2: tokenized doc of n2 entries (each entry an iterable of str tokens).
        model: if None, generate one ad hoc from doc1 and doc2 via
            ``gen_model`` ("lucene-style tfidf").
        tf_type, idf_type, dl_type, norm, min_df, max_df, max_n_terms,
        vocabulary_terms: forwarded to ``gen_model`` when ``model`` is None;
            refer to textacy.representations.Vectorizer.

    Attributes:
        model: the vectorizer used by the most recent call, stored on the
            function object as ``docterm_scores.model``.

    Returns:
        Dense matrix ``dt2 . dt1.T``: with one row per entry in each
        transformed doc, the shape is (n2, n1) -- rows follow doc2,
        columns follow doc1.

    Raises:
        AssertionError: if doc1 or doc2 contains a non-str token.
    """
    # Materialize once: validation, model fitting, transform and the size
    # estimate all need to traverse the inputs, which would silently see
    # empty data if doc1/doc2 were generators.
    docs1 = _as_token_lists(doc1, "doc1")
    docs2 = _as_token_lists(doc2, "doc2")

    if model is None:
        model = gen_model(
            docs1 + docs2,
            tf_type=tf_type,
            idf_type=idf_type,
            dl_type=dl_type,
            norm=norm,
            min_df=min_df,
            max_df=max_df,
            max_n_terms=max_n_terms,
            vocabulary_terms=vocabulary_terms
        )
    # Expose the model actually used so callers can reuse/inspect it.
    docterm_scores.model = model

    dt1 = model.transform(docs1)
    dt2 = model.transform(docs2)

    # Rough size of the result matrix: n1 * n2 entries at 8 bytes each
    # (presumably float64 -- the dense intermediates are not counted).
    require_ram = len(docs1) * len(docs2) * 8
    available = virtual_memory().available
    if require_ram > available:
        logger.warning("virtual_memory().available: %s", available)
        logger.warning("memory required: %s", require_ram)

    if require_ram > available * 10:
        logger.warning("You'll likely encounter memory problem, such as slow down response and/or OOM.")

    return dt2.toarray().dot(dt1.toarray().T)
|
|