# Hugging Face Spaces status banner captured by the scraper ("Spaces: Sleeping");
# kept as a comment so the file remains valid Python.
import re
import unicodedata

import streamlit as st
import pandas as pd
import psycopg2
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# One-time corpus downloads (no-ops when already present); quiet=True keeps
# the download progress out of the app log on every rerun.
nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('stopwords', quiet=True)

# English stop-word set shared by all the text-cleaning helpers below.
stop_words = set(stopwords.words('english'))
def get_paragraph(row, index):
    """Flatten the list of strings at row[index] into one lower-cased,
    space-separated paragraph.

    The result keeps the original leading-space convention (every item is
    prefixed with ' '), so an empty list yields ''.  Built with join instead
    of repeated += concatenation, which is quadratic.
    """
    return ''.join(' ' + item.lower() for item in row[index])
def remove_accents(text):
    """Strip accents/diacritics: NFKD-decompose, drop the non-ASCII
    combining marks, and return the plain-ASCII string."""
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_only = decomposed.encode('ASCII', 'ignore')
    return ascii_only.decode('utf-8')
def get_clean_text(row, index):
    """Tokenize the string at row[index] and keep purely alphabetic,
    non-stop-word tokens of length > 1, returned space-separated with a
    leading space (matching get_paragraph's convention).

    Non-string cells and the literal string "NULL" yield ''.
    """
    text = row[index]
    if not isinstance(text, str) or text == "NULL":
        return ''
    kept = []
    for word in word_tokenize(text.lower()):
        word = remove_accents(word.replace(',', ' '))
        # The original also tested `word[1] != '.'`, but the regex below
        # already guarantees letters only, so that guard was always true
        # and is dropped here.
        if re.match(r'^[a-zA-Z]+$', word) and word not in stop_words and len(word) > 1:
            kept.append(word)
    # join instead of repeated += (quadratic) string concatenation.
    return ''.join(' ' + w for w in kept)
def combine(row, indices):
    """Concatenate the string fields named in *indices* into a single
    space-separated string (leading space preserved, consistent with
    get_paragraph/get_clean_text).  Uses join instead of quadratic +=.
    """
    return ''.join(' ' + row[i] for i in indices)
# Read the Springer metadata from the PostgreSQL table.
query = "SELECT * FROM base_springerdata"


@st.cache_data  # cache the query result across Streamlit reruns (the original comments asked for this decorator)
def load_data(query):
    """Fetch the catalogue table into a DataFrame and drop the unused
    'item_doi' column.

    SECURITY NOTE(review): the connection credentials were hard-coded in
    source.  They are kept only as fallbacks so existing deployments keep
    working — rotate them and supply PGHOST/PGDATABASE/PGUSER/PGPASSWORD
    via the environment instead.
    """
    import os
    conn = psycopg2.connect(
        host=os.environ.get("PGHOST", "ep-soft-art-878483.ap-southeast-1.aws.neon.tech"),
        database=os.environ.get("PGDATABASE", "neondb"),
        user=os.environ.get("PGUSER", "Raghuveer22"),
        password=os.environ.get("PGPASSWORD", "pw3tvedja4XU"),
    )
    try:
        df = pd.read_sql_query(query, conn)
    finally:
        # Close even if the query raises — the original leaked the
        # connection on error.
        conn.close()
    return df.drop(['item_doi'], axis=1)


# Article-level master DataFrame used by all downstream processing.
main_df = load_data(query)
def get_journal_df(df):
    """Aggregate the article-level table into one row per journal.

    Builds, per publication_title: the list of article titles, the list of
    author strings, and the (deduplicated) keywords, then cleans each with
    the text helpers and combines them into a single 'Tags' column used for
    TF-IDF matching.  Returns the journal-level DataFrame.
    """
    # One row per journal: list of its article titles.
    journal_art = df.groupby('publication_title')['item_title'].apply(list).reset_index(name='Articles')
    journal_art.set_index(['publication_title'], inplace=True)
    # One row per journal: list of its author strings.
    journal_auth = df.groupby('publication_title')['authors'].apply(list).reset_index(name='authors')
    journal_auth.set_index('publication_title', inplace=True)
    # Keywords: first occurrence per (journal, keywords) pair; drop
    # article-level columns that don't belong at journal granularity.
    journal_key = df.drop_duplicates(subset=["publication_title", "keywords"], keep='first')
    journal_key = journal_key.drop(['item_title', 'authors', 'publication_year', 'url'], axis=1)
    journal_key.set_index(['publication_title'], inplace=True)
    journal_main = journal_art.join([journal_key, journal_auth])
    print('journal_main initial')  # typo fixed: was 'intial'
    journal_main.reset_index(inplace=True)
    # Flatten the list columns to paragraphs, then clean all text columns.
    journal_main['Articles'] = journal_main.apply(get_paragraph, index='Articles', axis=1)
    journal_main['Articles'] = journal_main.apply(get_clean_text, index='Articles', axis=1)
    journal_main['authors'] = journal_main.apply(get_paragraph, index='authors', axis=1)
    journal_main['authors'] = journal_main.apply(get_clean_text, index='authors', axis=1)
    journal_main['keywords'] = journal_main.apply(get_clean_text, index='keywords', axis=1)
    # 'Tags' is the concatenation fed to the TF-IDF vectorizer.
    journal_main['Tags'] = journal_main.apply(combine, indices=['keywords', 'Articles', 'authors'], axis=1)
    journal_main['Tags'] = journal_main.apply(get_clean_text, index='Tags', axis=1)
    return journal_main


journal_main = get_journal_df(main_df)
print('journal_main processed')
# Journal Dataframe
def get_tfidfs(journal_main):
    """Fit a TF-IDF vectorizer on the journal-level 'Tags' column.

    Returns (fitted vectorizer, document-term matrix).
    """
    tags_vectorizer = TfidfVectorizer(decode_error='ignore', strip_accents='ascii')
    tags_matrix = tags_vectorizer.fit_transform(journal_main['Tags'])
    return tags_vectorizer, tags_matrix


vectorizer, journal_tfidf_matrix = get_tfidfs(journal_main)
print('tfids and vectorizer for journals completed')
# Maximum number of journals considered per query.
journal_threshold = 4


def get_journal_index(user_input):
    """Rank journals by cosine similarity between the user's text and each
    journal's TF-IDF vector; return up to journal_threshold row indices with
    strictly positive similarity, best first."""
    query_vec = vectorizer.transform([user_input])
    scores = cosine_similarity(query_vec, journal_tfidf_matrix).flatten()
    ranked = scores.argsort()[::-1]
    positive = [idx for idx in ranked if scores[idx] > 0]
    return positive[:min(journal_threshold, len(ranked))]
def get_article_df(row):
    """Build the per-journal article DataFrame for one journal_main row.

    Selects this journal's articles from main_df, cleans titles/authors,
    POS-tags the titles, and assembles a 'Tags' string (noun/adjective
    title tokens + authors + year) for per-journal TF-IDF matching.
    """
    journal_title = journal_main['publication_title'][row.name]
    article = main_df.loc[main_df['publication_title'] == journal_title].copy()
    article['item_title'] = article.apply(get_clean_text, index='item_title', axis=1)
    article['authors'] = article.apply(get_clean_text, index='authors', axis=1)
    article['Tokenized'] = article['item_title'].apply(word_tokenize)
    article['Tagged'] = article['Tokenized'].apply(pos_tag)
    # BUG FIX: the original wrote `A or B and C`; since `and` binds tighter,
    # every 'NN*' token bypassed the stop-word test.  Parenthesized to the
    # intended "(noun or adjective) and not a stop word".
    article['Tags'] = article['Tagged'].apply(
        lambda tagged: [word for word, tag in tagged
                        if (tag.startswith('NN') or tag.startswith('JJ'))
                        and word.lower() not in stop_words])
    article['Tags'] = article.apply(get_paragraph, index='Tags', axis=1)
    article['Tags'] = article.apply(
        lambda x: x['Tags'] + ' ' + x['authors'] + ' ' + str(x['publication_year']), axis=1)
    article = article.drop(
        ['keywords', 'publication_title', 'Tokenized', 'Tagged', 'authors', 'publication_year'],
        axis=1)
    # Preserve the original main_df row labels as a named index.
    article.reset_index(inplace=True)
    article.set_index('index', inplace=True)
    return article
def get_vectorizer(row):
    """Return a fresh per-journal TfidfVectorizer.

    The row argument is unused; it exists only because this is invoked via
    DataFrame.apply(axis=1).
    """
    return TfidfVectorizer(decode_error='ignore', strip_accents='ascii')
def get_tfidf_matrix(row):
    """Fit this journal's stored vectorizer on its articles' 'Tags' column
    and return the resulting TF-IDF matrix."""
    journal_vectorizer = row['article_vectorizer']
    journal_articles = row['article_df']
    return journal_vectorizer.fit_transform(journal_articles['Tags'])
def article_preprocessing(df):
    """Attach per-journal article data to each journal row: the article
    DataFrame, its dedicated vectorizer, and its fitted TF-IDF matrix.

    Columns are built in order because each step depends on the previous.
    """
    column_builders = (
        ('article_df', get_article_df),
        ('article_vectorizer', get_vectorizer),
        ('article_matrix', get_tfidf_matrix),
    )
    for column, builder in column_builders:
        df[column] = df.apply(builder, axis=1)
    return df


journal_main = article_preprocessing(journal_main)
print('done')
# Maximum number of articles kept per matched journal.
article_threshold = 10


def get_article_recommendations(user_input):
    """For each journal matched by get_journal_index, rank its articles by
    cosine similarity to the user's text.

    Returns (similarity, article_row_position, journal_id) tuples, globally
    sorted best-first across all matched journals.
    """
    matches = []
    for journal_id in get_journal_index(user_input):
        query_vec = journal_main['article_vectorizer'][journal_id].transform([user_input])
        scores = cosine_similarity(query_vec, journal_main['article_matrix'][journal_id]).flatten()
        ranked = scores.argsort()[::-1]
        positive = [(scores[i], i, journal_id) for i in ranked if scores[i] > 0]
        matches += positive[:min(article_threshold, len(ranked))]
    matches.sort(reverse=True)
    return matches
def get_links(user_input):
    """Resolve ranked recommendations into display tuples.

    Returns a list of (article_title, article_url, article_id, journal_id),
    where title/url are columns 0 and 1 of the per-journal article DataFrame.
    """
    recommendations = get_article_recommendations(user_input)
    print(recommendations)
    links = []
    # NOTE: the original unpacked the score into a local named
    # `cosine_similarity`, shadowing the sklearn import; renamed.
    for similarity, article_id, journal_id in recommendations:
        article_df = journal_main['article_df'][journal_id]
        links.append((
            article_df.iloc[article_id, 0],
            article_df.iloc[article_id, 1],
            article_id,
            journal_id,
        ))
    print(links)
    return links
# Define the Streamlit app
def main():
    """Render the UI: collect the user's interests and, on demand, display
    the ranked article recommendations."""
    st.title("Article Recommendation System")
    st.subheader("Testing application of ScholarSync™")
    # Free-text description of the user's interests.
    user_input = st.text_area("Enter your interests", height=100)
    if st.button("Recommend Articles"):
        for article_title, article_link, article_id, journal_id in get_links(user_input):
            st.markdown(f"**Article Title:** {article_title}")
            st.markdown(f"**Article Link:** {article_link}")
            st.markdown(f"**Article ID:** {article_id}")
            st.markdown(f"**Journal ID:** {journal_id}")
            st.markdown("---")


if __name__ == '__main__':
    main()