# NOTE(review): "Spaces: Sleeping" below was Hugging Face Space status-banner
# residue captured by the page scrape — it is not source code and has been
# reduced to this provenance comment so the file parses as Python.
| import pandas as pd | |
| from gensim.corpora import Dictionary | |
| from gensim.similarities import SparseMatrixSimilarity | |
| from gensim.models import TfidfModel | |
| from gensim.parsing import strip_tags, strip_numeric, \ | |
| strip_multiple_whitespaces, stem_text, strip_punctuation, \ | |
| remove_stopwords, preprocess_string | |
| from re import sub | |
| from typing import List | |
| from functools import cache | |
def transform_to_lower(s: str) -> str:
    """Return *s* lowercased (was a PEP 8 E731 lambda assignment)."""
    return s.lower()


def remove_single_char(s: str) -> str:
    """Remove single-word-character tokens surrounded by whitespace.

    Because the pattern consumes the whitespace on both sides and replaces
    the whole match with '', the neighbouring words are joined together
    (e.g. 'the a cat' -> 'thecat').  Pattern kept byte-identical to the
    original to preserve behavior.
    """
    return sub(r'\s+\w{1}\s+', '', s)
# Ordered pipeline of text-cleaning callables consumed by
# gensim.parsing.preprocess_string (each takes a str and returns a str).
# Order matters: lowercasing must precede remove_stopwords (gensim's
# stopword list is lowercase), and remove_single_char runs last so it can
# catch single characters exposed by the earlier filters.
cleaning_filters = [
    strip_tags,
    strip_numeric,
    strip_punctuation,
    strip_multiple_whitespaces,
    transform_to_lower,
    remove_stopwords,
    remove_single_char
]
def gensim_tokenizer(docs: List[str]) -> List[List[str]]:
    """
    Tokenize a list of raw strings with the module's cleaning pipeline.

    Args:
        docs (List[str]): Raw documents to be tokenized.

    Returns:
        List[List[str]]: One token list per input document, produced by
        running ``cleaning_filters`` through gensim's ``preprocess_string``.
    """
    # Comprehension replaces the original manual append loop (PERF401);
    # output order and content are unchanged.
    return [preprocess_string(doc, cleaning_filters) for doc in docs]
def cleaning_pipe(document):
    """
    Clean a single document with the module-level ``cleaning_filters``.

    Args:
        document (str): Raw text to clean.

    Returns:
        list: Tokens remaining after all cleaning filters are applied.
    """
    # Thin wrapper: gensim's preprocess_string does all the work.
    return preprocess_string(document, cleaning_filters)
def get_closest_n(dictionary: Dictionary, index: SparseMatrixSimilarity, tfidf_model: TfidfModel, query: str, n: int):
    '''
    Retrieves the top matching documents as per cosine similarity
    between the TF-IDF vector of the query and all documents.

    Args:
        dictionary (Dictionary): Gensim dictionary mapping tokens to ids.
        index (SparseMatrixSimilarity): Precomputed similarity index over the corpus.
        tfidf_model (TfidfModel): Trained TF-IDF model used to weight the query.
        query (str): The query string to find matching documents.
        n (int): The number of closest documents to retrieve.

    Returns:
        numpy.ndarray: Indices of the top n documents, best match first.
    '''
    # Clean and tokenize the query with the shared pipeline.
    query_document = cleaning_pipe(query)
    # Convert the token list to a bag-of-words vector.
    query_bow = dictionary.doc2bow(query_document)
    # Cosine similarities between the TF-IDF-weighted query and every document.
    sims = index[tfidf_model[query_bow]]
    # Sort descending, then take the first n.  The original
    # `sims.argsort()[-1 * n:][::-1]` returned ALL documents when n == 0,
    # because `[-0:]` is the full slice; `[::-1][:n]` handles n == 0 correctly.
    top_idx = sims.argsort()[::-1][:n]
    return top_idx
def get_recomendations_metadata(query: str, df: pd.DataFrame, n: int,
                                dictionary: Dictionary, index: SparseMatrixSimilarity,
                                tfidf_model : TfidfModel) -> pd.DataFrame:
    '''
    Retrieves metadata recommendations based on a query using cosine similarity.

    Args:
        query (str): The query string for which recommendations are sought.
        df (pd.DataFrame): The DataFrame containing metadata information.
        n (int): The number of recommendations to retrieve.
        dictionary (Dictionary): Gensim dictionary used to vectorize the query.
        index (SparseMatrixSimilarity): Similarity index over the corpus.
        tfidf_model (TfidfModel): Trained TF-IDF model.

    Returns:
        pd.DataFrame: The recommended rows of *df*, re-indexed from 0.
    '''
    # Positional indices of the n best-matching documents.
    matched_positions = get_closest_n(dictionary, index, tfidf_model, query, n)
    # Select those rows and discard the original index.
    return df.iloc[matched_positions].reset_index(drop=True)
def load_arxiv_parquet(path: str):
    """Read the arXiv metadata parquet file at *path* into a DataFrame."""
    return pd.read_parquet(path)
def load_dict(path: str):
    """Load a previously saved gensim ``Dictionary`` from *path*."""
    return Dictionary.load(path)
def load_model(path: str ):
    """Load a previously saved gensim ``TfidfModel`` from *path*."""
    return TfidfModel.load(path)
def load_sparse_matrix(path: str):
    """Load a previously saved ``SparseMatrixSimilarity`` index from *path*."""
    return SparseMatrixSimilarity.load(path)