# Hugging Face Spaces status banner captured by the scraper ("Spaces: Sleeping");
# kept as a comment so the file remains valid Python.
import re
import unicodedata

import streamlit as st
import pandas as pd
import psycopg2
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# One-time corpus downloads (no-ops when already present); quiet=True keeps
# the download progress out of the app log on every rerun.
nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('stopwords', quiet=True)

# English stop-word set shared by all the text-cleaning helpers below.
stop_words = set(stopwords.words('english'))
def get_paragraph(row, index):
    """Flatten the list of strings at row[index] into one lower-cased,
    space-separated paragraph.

    The result keeps the original leading-space convention (every item is
    prefixed with ' '), so an empty list yields ''.  Built with join instead
    of repeated += concatenation, which is quadratic.
    """
    return ''.join(' ' + item.lower() for item in row[index])
def remove_accents(text):
    """Strip accents/diacritics: NFKD-decompose, drop the non-ASCII
    combining marks, and return the plain-ASCII string."""
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_only = decomposed.encode('ASCII', 'ignore')
    return ascii_only.decode('utf-8')
def get_clean_text(row, index):
    """Tokenize the string at row[index] and keep purely alphabetic,
    non-stop-word tokens of length > 1, returned space-separated with a
    leading space (matching get_paragraph's convention).

    Non-string cells and the literal string "NULL" yield ''.
    """
    text = row[index]
    if not isinstance(text, str) or text == "NULL":
        return ''
    kept = []
    for word in word_tokenize(text.lower()):
        word = remove_accents(word.replace(',', ' '))
        # The original also tested `word[1] != '.'`, but the regex below
        # already guarantees letters only, so that guard was always true
        # and is dropped here.
        if re.match(r'^[a-zA-Z]+$', word) and word not in stop_words and len(word) > 1:
            kept.append(word)
    # join instead of repeated += (quadratic) string concatenation.
    return ''.join(' ' + w for w in kept)
def combine(row, indices):
    """Concatenate the string fields named in *indices* into a single
    space-separated string (leading space preserved, consistent with
    get_paragraph/get_clean_text).  Uses join instead of quadratic +=.
    """
    return ''.join(' ' + row[i] for i in indices)
# Read the Springer metadata from the PostgreSQL table.
query = "SELECT * FROM base_springerdata"


@st.cache_data  # cache the query result across Streamlit reruns (the original comments asked for this decorator)
def load_data(query):
    """Fetch the catalogue table into a DataFrame and drop the unused
    'item_doi' column.

    SECURITY NOTE(review): the connection credentials were hard-coded in
    source.  They are kept only as fallbacks so existing deployments keep
    working — rotate them and supply PGHOST/PGDATABASE/PGUSER/PGPASSWORD
    via the environment instead.
    """
    import os
    conn = psycopg2.connect(
        host=os.environ.get("PGHOST", "ep-soft-art-878483.ap-southeast-1.aws.neon.tech"),
        database=os.environ.get("PGDATABASE", "neondb"),
        user=os.environ.get("PGUSER", "Raghuveer22"),
        password=os.environ.get("PGPASSWORD", "pw3tvedja4XU"),
    )
    try:
        df = pd.read_sql_query(query, conn)
    finally:
        # Close even if the query raises — the original leaked the
        # connection on error.
        conn.close()
    return df.drop(['item_doi'], axis=1)


# Article-level master DataFrame used by all downstream processing.
main_df = load_data(query)
def get_journal_df(df):
    """Aggregate the article-level table into one row per journal.

    Builds, per publication_title: the list of article titles, the list of
    author strings, and the (deduplicated) keywords, then cleans each with
    the text helpers and combines them into a single 'Tags' column used for
    TF-IDF matching.  Returns the journal-level DataFrame.
    """
    # One row per journal: list of its article titles.
    journal_art = df.groupby('publication_title')['item_title'].apply(list).reset_index(name='Articles')
    journal_art.set_index(['publication_title'], inplace=True)
    # One row per journal: list of its author strings.
    journal_auth = df.groupby('publication_title')['authors'].apply(list).reset_index(name='authors')
    journal_auth.set_index('publication_title', inplace=True)
    # Keywords: first occurrence per (journal, keywords) pair; drop
    # article-level columns that don't belong at journal granularity.
    journal_key = df.drop_duplicates(subset=["publication_title", "keywords"], keep='first')
    journal_key = journal_key.drop(['item_title', 'authors', 'publication_year', 'url'], axis=1)
    journal_key.set_index(['publication_title'], inplace=True)
    journal_main = journal_art.join([journal_key, journal_auth])
    print('journal_main initial')  # typo fixed: was 'intial'
    journal_main.reset_index(inplace=True)
    # Flatten the list columns to paragraphs, then clean all text columns.
    journal_main['Articles'] = journal_main.apply(get_paragraph, index='Articles', axis=1)
    journal_main['Articles'] = journal_main.apply(get_clean_text, index='Articles', axis=1)
    journal_main['authors'] = journal_main.apply(get_paragraph, index='authors', axis=1)
    journal_main['authors'] = journal_main.apply(get_clean_text, index='authors', axis=1)
    journal_main['keywords'] = journal_main.apply(get_clean_text, index='keywords', axis=1)
    # 'Tags' is the concatenation fed to the TF-IDF vectorizer.
    journal_main['Tags'] = journal_main.apply(combine, indices=['keywords', 'Articles', 'authors'], axis=1)
    journal_main['Tags'] = journal_main.apply(get_clean_text, index='Tags', axis=1)
    return journal_main


journal_main = get_journal_df(main_df)
print('journal_main processed')
# Journal Dataframe
def get_tfidfs(journal_main):
    """Fit a TF-IDF vectorizer on the journal-level 'Tags' column.

    Returns (fitted vectorizer, document-term matrix).
    """
    tags_vectorizer = TfidfVectorizer(decode_error='ignore', strip_accents='ascii')
    tags_matrix = tags_vectorizer.fit_transform(journal_main['Tags'])
    return tags_vectorizer, tags_matrix


vectorizer, journal_tfidf_matrix = get_tfidfs(journal_main)
print('tfids and vectorizer for journals completed')
# Maximum number of journals considered per query.
journal_threshold = 4


def get_journal_index(user_input):
    """Rank journals by cosine similarity between the user's text and each
    journal's TF-IDF vector; return up to journal_threshold row indices with
    strictly positive similarity, best first."""
    query_vec = vectorizer.transform([user_input])
    scores = cosine_similarity(query_vec, journal_tfidf_matrix).flatten()
    ranked = scores.argsort()[::-1]
    positive = [idx for idx in ranked if scores[idx] > 0]
    return positive[:min(journal_threshold, len(ranked))]
def get_article_df(row):
    """Build the per-journal article DataFrame for one journal_main row.

    Selects this journal's articles from main_df, cleans titles/authors,
    POS-tags the titles, and assembles a 'Tags' string (noun/adjective
    title tokens + authors + year) for per-journal TF-IDF matching.
    """
    journal_title = journal_main['publication_title'][row.name]
    article = main_df.loc[main_df['publication_title'] == journal_title].copy()
    article['item_title'] = article.apply(get_clean_text, index='item_title', axis=1)
    article['authors'] = article.apply(get_clean_text, index='authors', axis=1)
    article['Tokenized'] = article['item_title'].apply(word_tokenize)
    article['Tagged'] = article['Tokenized'].apply(pos_tag)
    # BUG FIX: the original wrote `A or B and C`; since `and` binds tighter,
    # every 'NN*' token bypassed the stop-word test.  Parenthesized to the
    # intended "(noun or adjective) and not a stop word".
    article['Tags'] = article['Tagged'].apply(
        lambda tagged: [word for word, tag in tagged
                        if (tag.startswith('NN') or tag.startswith('JJ'))
                        and word.lower() not in stop_words])
    article['Tags'] = article.apply(get_paragraph, index='Tags', axis=1)
    article['Tags'] = article.apply(
        lambda x: x['Tags'] + ' ' + x['authors'] + ' ' + str(x['publication_year']), axis=1)
    article = article.drop(
        ['keywords', 'publication_title', 'Tokenized', 'Tagged', 'authors', 'publication_year'],
        axis=1)
    # Preserve the original main_df row labels as a named index.
    article.reset_index(inplace=True)
    article.set_index('index', inplace=True)
    return article
def get_vectorizer(row):
    """Return a fresh per-journal TfidfVectorizer.

    The row argument is unused; it exists only because this is invoked via
    DataFrame.apply(axis=1).
    """
    return TfidfVectorizer(decode_error='ignore', strip_accents='ascii')
def get_tfidf_matrix(row):
    """Fit this journal's stored vectorizer on its articles' 'Tags' column
    and return the resulting TF-IDF matrix."""
    journal_vectorizer = row['article_vectorizer']
    journal_articles = row['article_df']
    return journal_vectorizer.fit_transform(journal_articles['Tags'])
def article_preprocessing(df):
    """Attach per-journal article data to each journal row: the article
    DataFrame, its dedicated vectorizer, and its fitted TF-IDF matrix.

    Columns are built in order because each step depends on the previous.
    """
    column_builders = (
        ('article_df', get_article_df),
        ('article_vectorizer', get_vectorizer),
        ('article_matrix', get_tfidf_matrix),
    )
    for column, builder in column_builders:
        df[column] = df.apply(builder, axis=1)
    return df


journal_main = article_preprocessing(journal_main)
print('done')
# Maximum number of articles kept per matched journal.
article_threshold = 10


def get_article_recommendations(user_input):
    """For each journal matched by get_journal_index, rank its articles by
    cosine similarity to the user's text.

    Returns (similarity, article_row_position, journal_id) tuples, globally
    sorted best-first across all matched journals.
    """
    matches = []
    for journal_id in get_journal_index(user_input):
        query_vec = journal_main['article_vectorizer'][journal_id].transform([user_input])
        scores = cosine_similarity(query_vec, journal_main['article_matrix'][journal_id]).flatten()
        ranked = scores.argsort()[::-1]
        positive = [(scores[i], i, journal_id) for i in ranked if scores[i] > 0]
        matches += positive[:min(article_threshold, len(ranked))]
    matches.sort(reverse=True)
    return matches
def get_links(user_input):
    """Resolve ranked recommendations into display tuples.

    Returns a list of (article_title, article_url, article_id, journal_id),
    where title/url are columns 0 and 1 of the per-journal article DataFrame.
    """
    recommendations = get_article_recommendations(user_input)
    print(recommendations)
    links = []
    # NOTE: the original unpacked the score into a local named
    # `cosine_similarity`, shadowing the sklearn import; renamed.
    for similarity, article_id, journal_id in recommendations:
        article_df = journal_main['article_df'][journal_id]
        links.append((
            article_df.iloc[article_id, 0],
            article_df.iloc[article_id, 1],
            article_id,
            journal_id,
        ))
    print(links)
    return links
# Define the Streamlit app
def main():
    """Render the UI: collect the user's interests and, on demand, display
    the ranked article recommendations."""
    st.title("Article Recommendation System")
    st.subheader("Testing application of ScholarSync™")
    # Free-text description of the user's interests.
    user_input = st.text_area("Enter your interests", height=100)
    if st.button("Recommend Articles"):
        for article_title, article_link, article_id, journal_id in get_links(user_input):
            st.markdown(f"**Article Title:** {article_title}")
            st.markdown(f"**Article Link:** {article_link}")
            st.markdown(f"**Article ID:** {article_id}")
            st.markdown(f"**Journal ID:** {journal_id}")
            st.markdown("---")


if __name__ == '__main__':
    main()