Spaces:
Sleeping
Sleeping
File size: 7,835 Bytes
f1dd5d7 9a131ac f1dd5d7 7bb3b3f f1dd5d7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 |
import streamlit as st
import pandas as pd
import psycopg2
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import unicodedata
# Fetch the NLTK resources this script depends on (tokenizer, POS tagger,
# stop-word list). nltk.download is a no-op when the data is already present.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
# English stop words; consulted by get_clean_text (and a lambda in get_article_df).
stop_words = set(stopwords.words('english'))
def get_paragraph(row, index):
    """Flatten the list stored at row[index] into a single lower-cased
    string, each element preceded by one space (so the result has a
    leading space; an empty list yields '')."""
    return ''.join(' ' + piece.lower() for piece in row[index])
def remove_accents(text):
    """Strip accents by NFKD-decomposing *text* and discarding every
    code point that does not survive an ASCII round-trip."""
    decomposed = unicodedata.normalize('NFKD', text)
    return decomposed.encode('ASCII', 'ignore').decode('utf-8')
def get_clean_text(row, index):
    """Normalize the text at row[index] for TF-IDF matching.

    Lower-cases, tokenizes with NLTK, strips accents, and keeps only purely
    alphabetic, non-stop-word tokens longer than one character. Returns ''
    for non-string values or the literal string "NULL". The result carries
    a leading space per kept word, matching get_paragraph's convention.
    """
    raw = row[index]
    if not isinstance(raw, str) or raw == "NULL":
        return ''
    kept = []
    for word in word_tokenize(raw.lower()):
        word = remove_accents(word.replace(',', ' '))
        # The original also tested `word[1] != '.'`, but the regex already
        # guarantees letters only, so that condition was dead code — removed.
        if re.match(r'^[a-zA-Z]+$', word) and word not in stop_words and len(word) > 1:
            kept.append(word)
    # join instead of repeated `+=` to avoid quadratic string building.
    return ''.join(' ' + w for w in kept)
def combine(row, indices):
    """Concatenate the string values at the given row keys, each preceded
    by a single space (empty *indices* yields '')."""
    parts = [row[key] for key in indices]
    return ''.join(' ' + part for part in parts)
# SQL executed by load_data() below: pull the whole Springer base table
# into a pandas DataFrame.
query = "SELECT * FROM base_springerdata"
# Cached across Streamlit reruns so the database is queried only once.
@st.cache_data
def load_data(query):
    """Run *query* against the Neon PostgreSQL database and return the
    result as a DataFrame with the item_doi column dropped.

    The connection is closed in a finally block so it is not leaked when
    read_sql_query raises (the original closed it only on success).
    """
    import os

    # SECURITY: credentials were hard-coded in source. Prefer environment
    # variables; the old literals remain only as backward-compatible defaults
    # and should be rotated/removed.
    conn = psycopg2.connect(
        host=os.environ.get("DB_HOST", "ep-soft-art-878483.ap-southeast-1.aws.neon.tech"),
        database=os.environ.get("DB_NAME", "neondb"),
        user=os.environ.get("DB_USER", "Raghuveer22"),
        password=os.environ.get("DB_PASSWORD", "pw3tvedja4XU"),
    )
    try:
        df = pd.read_sql_query(query, conn)
    finally:
        conn.close()
    return df.drop(['item_doi'], axis=1)
# Full article table, loaded once (st.cache_data) — the connection is
# opened and closed inside load_data.
main_df = load_data(query)
@st.cache_data
def get_journal_df(df):
    """Aggregate the article table into one row per journal.

    Produces a DataFrame indexed 0..n-1 with columns publication_title,
    Articles (cleaned title text), authors (cleaned author text), keywords,
    and Tags — the concatenation of all three, cleaned again — which is the
    text later fed to the TF-IDF vectorizer.
    """
    # Collect every journal's article titles and author strings into lists.
    journal_art = df.groupby('publication_title')['item_title'].apply(list).reset_index(name='Articles')
    journal_art.set_index(['publication_title'], inplace=True)
    journal_auth = df.groupby('publication_title')['authors'].apply(list).reset_index(name='authors')
    journal_auth.set_index('publication_title', inplace=True)
    # Keep one keywords value per journal (first occurrence wins).
    journal_key = df.drop_duplicates(subset=["publication_title", "keywords"], keep='first')
    journal_key = journal_key.drop(['item_title', 'authors', 'publication_year', 'url'], axis=1)
    journal_key.set_index(['publication_title'], inplace=True)
    journal_main = journal_art.join([journal_key, journal_auth])
    print('journal_main initial')  # fixed typo: was 'intial'
    journal_main.reset_index(inplace=True)
    # Flatten the list columns into text, then normalize every text column.
    journal_main['Articles'] = journal_main.apply(get_paragraph, index='Articles', axis=1)
    journal_main['Articles'] = journal_main.apply(get_clean_text, index='Articles', axis=1)
    journal_main['authors'] = journal_main.apply(get_paragraph, index='authors', axis=1)
    journal_main['authors'] = journal_main.apply(get_clean_text, index='authors', axis=1)
    journal_main['keywords'] = journal_main.apply(get_clean_text, index='keywords', axis=1)
    # Tags = keywords + article titles + authors, cleaned once more.
    journal_main['Tags'] = journal_main.apply(combine, indices=['keywords', 'Articles', 'authors'], axis=1)
    journal_main['Tags'] = journal_main.apply(get_clean_text, index='Tags', axis=1)
    return journal_main
# One row per journal with cleaned text columns plus the combined Tags text.
journal_main=get_journal_df(main_df)
print('journal_main processed')
@st.cache_data
def get_tfidfs(journal_main):
    """Fit a TF-IDF vectorizer over every journal's Tags text.

    Returns the fitted vectorizer together with the journal-by-term matrix.
    """
    tfidf = TfidfVectorizer(decode_error='ignore', strip_accents='ascii')
    matrix = tfidf.fit_transform(journal_main['Tags'])
    return tfidf, matrix
# Journal-level vectorizer/matrix used by get_journal_index for similarity search.
vectorizer,journal_tfidf_matrix = get_tfidfs(journal_main)
print('tfids and vectorizer for journals completed')
# Maximum number of journals considered per user query.
journal_threshold = 4
def get_journal_index(user_input):
    """Return the indices of the journals (at most journal_threshold) whose
    Tags text is most cosine-similar to *user_input*, best match first.
    Journals with zero similarity are excluded."""
    query_vec = vectorizer.transform([user_input])
    sims = cosine_similarity(query_vec, journal_tfidf_matrix).flatten()
    ranked = sims.argsort()[::-1]
    positive = [idx for idx in ranked if sims[idx] > 0]
    return positive[:min(journal_threshold, len(ranked))]
def get_article_df(row):
    """Build the per-journal article DataFrame for one journal_main row.

    Selects this journal's articles from main_df and derives a 'Tags' text
    column from the POS-filtered title words plus authors and year; the
    columns consumed along the way are dropped from the result.
    """
    article = main_df.loc[main_df['publication_title'] == journal_main['publication_title'][row.name]].copy()
    article['item_title'] = article.apply(get_clean_text, index='item_title', axis=1)
    article['authors'] = article.apply(get_clean_text, index='authors', axis=1)
    article['Tokenized'] = article['item_title'].apply(word_tokenize)
    article['Tagged'] = article['Tokenized'].apply(pos_tag)
    # BUGFIX: `and` binds tighter than `or`, so the original applied the
    # stop-word filter only to 'JJ' words and let every 'NN*' word through.
    # Parenthesized so the filter covers both tag classes, as intended.
    article['Tags'] = article['Tagged'].apply(lambda x: [word for word, tag in x if
        (tag.startswith('NN') or tag.startswith('JJ')) and word.lower() not in stop_words])
    article['Tags'] = article.apply(get_paragraph, index='Tags', axis=1)
    article['Tags'] = article.apply(lambda x: x['Tags'] + ' ' + x['authors'] + ' ' + str(x['publication_year']), axis=1)
    article = article.drop(['keywords', 'publication_title', 'Tokenized', 'Tagged', 'authors', 'publication_year'], axis=1)
    # Round-trip that renames the index to 'index' while keeping the original
    # main_df row labels (get_links addresses rows positionally via iloc).
    article.reset_index(inplace=True)
    article.set_index('index', inplace=True)
    return article
def get_vectorizer(row):
    """Return a fresh, unfitted TF-IDF vectorizer.

    *row* is unused; the signature exists so DataFrame.apply can hand one
    vectorizer to every journal row.
    """
    return TfidfVectorizer(decode_error='ignore', strip_accents='ascii')
def get_tfidf_matrix(row):
    """Fit this row's vectorizer on its article Tags and return the matrix."""
    article_tags = row['article_df']['Tags']
    return row['article_vectorizer'].fit_transform(article_tags)
@st.cache_data
def article_preprocessing(df):
    """Attach per-journal article data, vectorizer, and TF-IDF matrix
    columns to the journal DataFrame (mutates *df* in place and returns it).

    Column order matters: the matrix builder reads the two columns added
    before it.
    """
    for column, builder in (
        ('article_df', get_article_df),
        ('article_vectorizer', get_vectorizer),
        ('article_matrix', get_tfidf_matrix),
    ):
        df[column] = df.apply(builder, axis=1)
    return df
# Per-journal article DataFrames, vectorizers, and TF-IDF matrices.
journal_main=article_preprocessing(journal_main)
print('done')
# Maximum number of articles returned per recommended journal.
article_threshold = 10
def get_article_recommendations(user_input):
    """Score the articles of every recommended journal against *user_input*.

    Returns (similarity, article_row, journal_id) tuples across all
    recommended journals, highest similarity first; zero-similarity
    articles are skipped and each journal contributes at most
    article_threshold entries.
    """
    scored = []
    for jid in get_journal_index(user_input):
        query_vec = journal_main['article_vectorizer'][jid].transform([user_input])
        sims = cosine_similarity(query_vec, journal_main['article_matrix'][jid]).flatten()
        order = sims.argsort()[::-1]
        hits = [(sims[i], i, jid) for i in order if sims[i] > 0]
        scored += hits[:min(article_threshold, len(order))]
    return sorted(scored, reverse=True)
def get_links(user_input):
    """Resolve recommendations into (title, link, article_id, journal_id).

    Reads positions 0 and 1 of each journal's article_df row — the article
    title and URL columns — for every recommended (score, row, journal) tuple.
    """
    recommendations = get_article_recommendations(user_input)
    print(recommendations)
    links = []
    for article in recommendations:
        # Renamed the first field from 'cosine_similarity', which shadowed
        # the sklearn function imported under that name at module level.
        score, article_id, journal_id = article
        article_df = journal_main['article_df'][journal_id]
        links.append((
            article_df.iloc[article_id, 0],
            article_df.iloc[article_id, 1],
            article_id,
            journal_id,
        ))
    print(links)
    return links
# Streamlit entry point.
def main():
    """Render the UI: an interests text box plus a button that lists the
    recommended articles with their titles, links, and ids."""
    st.title("Article Recommendation System")
    st.subheader("Testing application of ScholarSync™")
    user_input = st.text_area("Enter your interests", height=100)
    # Guard clause: nothing to render until the button is pressed.
    if not st.button("Recommend Articles"):
        return
    for article_title, article_link, article_id, journal_id in get_links(user_input):
        st.markdown(f"**Article Title:** {article_title}")
        st.markdown(f"**Article Link:** {article_link}")
        st.markdown(f"**Article ID:** {article_id}")
        st.markdown(f"**Journal ID:** {journal_id}")
        st.markdown("---")
if __name__ == '__main__':
    main()
|