Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| from gensim import corpora | |
| from gensim import similarities | |
| from gensim.models import TfidfModel | |
| from gensim.parsing import strip_tags, strip_numeric, \ | |
| strip_multiple_whitespaces, stem_text, strip_punctuation, \ | |
| remove_stopwords, preprocess_string | |
| import re | |
| from typing import List | |
| from utils.constants import TEST_INPUTS | |
| import argparse | |
| from random import choice | |
# Matches a run of whitespace followed by exactly one word character that is
# itself followed by whitespace. The trailing whitespace is only *looked at*
# (not consumed) so that the words on either side stay separated and so that
# consecutive single characters ("a b c") are all matched.
_SINGLE_CHAR_PATTERN = re.compile(r'\s+\w(?=\s)')


def transform_to_lower(s):
    """Return *s* converted to lower case."""
    return s.lower()


def remove_single_char(s):
    """Return *s* with whitespace-delimited single-character tokens removed.

    Only characters that have whitespace on both sides are dropped; a single
    character at the very start or end of the string is left in place.  The
    neighbouring words remain separated by whitespace, e.g.
    ``"find a paper"`` becomes ``"find paper"``.
    """
    return _SINGLE_CHAR_PATTERN.sub('', s)
class PaperRecommender:
    """TF-IDF similarity recommender built over a table of paper abstracts.

    Typical use is to construct the object and call :meth:`run_recommender`,
    which loads the dataset, builds the gensim dictionary / TF-IDF model /
    similarity index and returns the rows most similar to ``query``.  After
    that, :meth:`get_recommendations_metadata` can be called again with other
    queries without rebuilding the model.
    """

    def __init__(self,
                 num_samples=3000,
                 corpus_dictionary_path="30Ktokens",
                 arxiv_dataset_path="/Users/luis.morales/Desktop/arxiv-paper-recommender/data/processed/reduced_arxiv_papers.parquet.gzip",
                 save_dict=False,
                 query="",
                 dictionaries_dir="/Users/luis.morales/Desktop/arxiv-paper-recommender/models/nlp_dictionaries"):
        """Store the configuration; no data is loaded until ``run_recommender``.

        Args:
            num_samples: Number of rows to randomly sample from the dataset,
                or ``None`` to use every row.
            corpus_dictionary_path: Base name (without extension) used when
                the dictionary is saved to disk.
            arxiv_dataset_path: Path of the parquet file to load.  It must
                contain a ``cleaned_abstracts`` column.
            save_dict: When true, the built dictionary is written to
                ``dictionaries_dir``.
            query: Text to find similar papers for.  When empty or ``None``
                a random entry of ``TEST_INPUTS`` is used instead.
            dictionaries_dir: Folder the dictionary is saved into when
                ``save_dict`` is true.
        """
        self.num_samples = num_samples
        self.corpus_dictionary_path = corpus_dictionary_path
        self.arxiv_dataset_path = arxiv_dataset_path
        self.save_dict = save_dict
        self.query = query
        self.dictionaries_dir = dictionaries_dir
        # Filters are applied in this order by gensim's ``preprocess_string``.
        self.cleaning_filters = [
            strip_tags,
            strip_numeric,
            strip_punctuation,
            strip_multiple_whitespaces,
            transform_to_lower,
            remove_stopwords,
            remove_single_char,
        ]
        # Populated by ``run_recommender``.
        self.dictionary = None
        self.index = None
        self.tfidf_model = None
        self.df = None
        self.results = None

    def gensim_tokenizer(self, docs: List[str]):
        """Return one list of cleaned tokens per document in *docs*."""
        return [self.cleaning_pipe(doc) for doc in docs]

    def cleaning_pipe(self, document):
        """Run *document* through ``self.cleaning_filters`` and return its tokens."""
        return preprocess_string(document, self.cleaning_filters)

    def get_gensim_dictionary(self, tokenized_docs: List[List[str]], dict_name: str = "corpus"):
        """Build a gensim ``Dictionary`` from already tokenized documents.

        When ``self.save_dict`` is true the dictionary is also saved as
        ``<dictionaries_dir>/<dict_name>.dict``.
        """
        dictionary = corpora.Dictionary(tokenized_docs)
        if self.save_dict:
            # Fall back to the historical location for instances that were
            # created without going through ``__init__``.
            parent_folder = getattr(
                self,
                "dictionaries_dir",
                "/Users/luis.morales/Desktop/arxiv-paper-recommender/models/nlp_dictionaries",
            )
            dictionary.save(f'{parent_folder}/{dict_name}.dict')
        return dictionary

    def get_closest_n(self, query: str, n: int):
        """Return the positions of the *n* documents most similar to *query*.

        The positions are ordered from most to least similar.  The model must
        have been built with ``run_recommender`` first.

        Raises:
            ValueError: If *n* is negative.
        """
        if n < 0:
            raise ValueError(f"n must be non-negative, got {n}")
        query_document = self.cleaning_pipe(query)
        query_bow = self.dictionary.doc2bow(query_document)
        sims = self.index[self.tfidf_model[query_bow]]
        # Reverse first and then take the head: slicing with ``[-n:]`` would
        # return *every* index when n == 0 because ``-0`` is ``0``.
        top_idx = sims.argsort()[::-1][:n]
        return top_idx

    def get_recommendations_metadata(self, query: str, n: int):
        """Return the rows of ``self.df`` for the *n* papers closest to *query*.

        The returned frame is re-indexed from 0 in order of decreasing
        similarity.
        """
        recommendations_idxs = self.get_closest_n(query, n)
        recommendations_metadata = self.df.iloc[recommendations_idxs]
        return recommendations_metadata.reset_index(drop=True)

    def run_recommender(self, n: int = 10):
        """Load the data, build the model and return recommendations.

        Args:
            n: Number of recommendations to return.

        Returns:
            The frame produced by ``get_recommendations_metadata`` for
            ``self.query``; it is also stored in ``self.results``.
        """
        papers = pd.read_parquet(self.arxiv_dataset_path)
        if self.num_samples is not None:
            # Never ask for more rows than exist; ``sample`` would raise.
            sample_size = min(self.num_samples, len(papers))
            papers = papers.sample(sample_size).reset_index(drop=True)
        self.df = papers

        corpus = self.df['cleaned_abstracts'].to_list()
        tokenized_corpus = self.gensim_tokenizer(corpus)
        self.dictionary = self.get_gensim_dictionary(
            tokenized_docs=tokenized_corpus,
            dict_name=self.corpus_dictionary_path,
        )
        # The dictionary was just built from this very corpus, so it must not
        # be updated again here: that would count every document twice in its
        # statistics without adding any new token.
        BoW_corpus = [self.dictionary.doc2bow(doc) for doc in tokenized_corpus]
        self.tfidf_model = TfidfModel(BoW_corpus)
        self.index = similarities.SparseMatrixSimilarity(
            self.tfidf_model[BoW_corpus],
            num_features=len(self.dictionary),
        )

        # The constructor default is "", so treat any empty query as missing.
        if not self.query:
            self.query = choice(TEST_INPUTS)
        self.results = self.get_recommendations_metadata(self.query, n)
        return self.results