Sai004 committed on
Commit
f1dd5d7
·
1 Parent(s): 3008106

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +204 -0
app.py ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import psycopg2
4
+ import re
5
+ import nltk
6
+ from nltk.tokenize import word_tokenize
7
+ from nltk.tag import pos_tag
8
+ from nltk.corpus import stopwords
9
+ from sklearn.feature_extraction.text import TfidfVectorizer
10
+ from sklearn.metrics.pairwise import cosine_similarity
11
+ import unicodedata
12
+
13
# Download the NLTK resources this module needs at import time:
# 'punkt' for word_tokenize, 'averaged_perceptron_tagger' for pos_tag,
# and 'stopwords' for the English stop-word set built below.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

# Connect to the PostgreSQL database

# Read the data from the PostgreSQL table
query = "SELECT * FROM base_springerdata"
21
# st cache
@st.cache_data  # 👈 cache the query result across Streamlit reruns
def load_data(query):
    """Fetch the Springer article table from PostgreSQL as a DataFrame.

    Drops the 'item_doi' column, which is unused by the recommendation
    pipeline.

    Parameters
    ----------
    query : str
        SQL to execute, e.g. "SELECT * FROM base_springerdata".

    Returns
    -------
    pandas.DataFrame
    """
    import os

    # SECURITY: the credentials were previously hard-coded in source
    # control. Read them from the environment instead; the old values
    # remain as defaults so existing deployments keep working, but they
    # should be rotated and the defaults removed.
    conn = psycopg2.connect(
        host=os.environ.get("DB_HOST", "ep-soft-art-878483.ap-southeast-1.aws.neon.tech"),
        database=os.environ.get("DB_NAME", "neondb"),
        user=os.environ.get("DB_USER", "Raghuveer22"),
        password=os.environ.get("DB_PASSWORD", "pw3tvedja4XU"),
    )
    try:
        df = pd.read_sql_query(query, conn)
    finally:
        # Close the connection even if the query raises; the original
        # leaked it on error.
        conn.close()
    return df.drop(['item_doi'], axis=1)
34
# main_df: the full article table loaded once at import time.
main_df = load_data(query)
36
+ # Close the database connection
37
+
38
@st.cache_data
def get_journal_df(df):
    """Aggregate the per-article table into one row per journal.

    Builds, for every publication_title: the list of its article titles,
    the list of its authors, and its keywords, then flattens and cleans
    them into a single 'Tags' text column used for TF-IDF matching.

    NOTE(review): this function calls get_paragraph, get_clean_text and
    combine, which are defined further down the module — it must not be
    invoked before those definitions have run.
    """
    # One list of article titles per journal.
    journal_art = df.groupby('publication_title')['item_title'].apply(list).reset_index(name='Articles')
    journal_art.set_index(['publication_title'], inplace=True)

    # One list of author strings per journal.
    journal_auth = df.groupby('publication_title')['authors'].apply(list).reset_index(name='authors')
    journal_auth.set_index('publication_title', inplace=True)

    # Keywords: keep the first (publication_title, keywords) occurrence.
    journal_key = df.drop_duplicates(subset=["publication_title", "keywords"], keep='first')
    journal_key = journal_key.drop(['item_title', 'authors', 'publication_year', 'url'], axis=1)
    journal_key.set_index(['publication_title'], inplace=True)

    # Align the three frames on publication_title.
    journal_main = journal_art.join([journal_key, journal_auth])
    print('journal_main intial')
    journal_main.reset_index(inplace=True)
    # Flatten the title/author lists into lower-cased text, then clean.
    journal_main['Articles'] = journal_main.apply(get_paragraph, index='Articles', axis=1)
    journal_main['Articles'] = journal_main.apply(get_clean_text, index='Articles', axis=1)
    journal_main['authors'] = journal_main.apply(get_paragraph, index='authors', axis=1)
    journal_main['authors'] = journal_main.apply(get_clean_text, index='authors', axis=1)
    journal_main['keywords'] = journal_main.apply(get_clean_text, index='keywords', axis=1)

    # 'Tags' = keywords + article titles + authors, cleaned once more.
    journal_main['Tags'] = journal_main.apply(combine, indices=['keywords', 'Articles', 'authors'], axis=1)
    journal_main['Tags'] = journal_main.apply(get_clean_text, index='Tags', axis=1)
    return journal_main
62
+
63
# Build the aggregated journal dataframe once at import time.
# NOTE(review): get_journal_df internally uses get_paragraph,
# get_clean_text, combine and stop_words, all of which are defined
# *below* this call — as written, this line raises NameError on a fresh
# run; the helper definitions should be moved above it.
journal_main=get_journal_df(main_df)
print('journal_main processed')
65
# Journal Dataframe


# English stop-word set used by get_clean_text and get_article_df below.
stop_words = set(stopwords.words('english'))
69
+
70
def get_paragraph(row, index):
    """Join the list stored at row[index] into one lower-cased string.

    Each element is prefixed with a single space, so a non-empty list
    yields a string that starts with ' '; an empty list yields ''.
    """
    items = row[index]
    return ''.join(' ' + item.lower() for item in items)
75
+
76
def remove_accents(text):
    """Strip accents by NFKD-decomposing and dropping non-ASCII bytes."""
    decomposed = unicodedata.normalize('NFKD', text)
    return decomposed.encode('ASCII', 'ignore').decode('utf-8')
79
+
80
def get_clean_text(row, index):
    """Normalize the text stored at row[index] for TF-IDF matching.

    Lower-cases and tokenizes the text, strips accents, and keeps only
    purely alphabetic tokens longer than one character that are not
    English stop words. Returns '' for non-string values and for the
    literal "NULL" placeholder.

    Each kept word is prefixed with a space, so a non-empty result
    starts with ' '.
    """
    value = row[index]
    if not isinstance(value, str) or value == "NULL":
        return ''
    clean_text = ''
    for word in word_tokenize(value.lower()):
        word = remove_accents(word)
        # The original also replaced ',' with ' ' and rejected words whose
        # second character was '.'; both checks were dead code, since the
        # regex below admits only letters.
        if re.match(r'^[a-zA-Z]+$', word) and word not in stop_words and len(word) > 1:
            clean_text += ' ' + word
    return clean_text
93
+
94
def combine(row, indices):
    """Concatenate the row's fields named in *indices*, each preceded
    by a single space (so a non-empty result starts with ' ')."""
    return ''.join(' ' + row[name] for name in indices)
99
@st.cache_data
def get_tfidfs(journal_main):
    """Fit a TF-IDF vectorizer over the journals' 'Tags' column.

    Returns (fitted vectorizer, journal TF-IDF matrix).
    """
    tfidf = TfidfVectorizer(decode_error='ignore', strip_accents='ascii')
    matrix = tfidf.fit_transform(journal_main['Tags'])
    return tfidf, matrix
104
+
105
# Fit the journal-level TF-IDF model once at import time.
vectorizer,journal_tfidf_matrix = get_tfidfs(journal_main)
print('tfids and vectorizer for journals completed')
# Maximum number of journals considered for each user query.
journal_threshold = 4
108
+
109
def get_journal_index(user_input):
    """Return indices of the journals most similar to the user's text.

    Ranks journals by cosine similarity between the TF-IDF vector of
    *user_input* and each journal's 'Tags' vector, and returns at most
    journal_threshold indices with strictly positive similarity, best
    first.
    """
    query_vec = vectorizer.transform([user_input])
    scores = cosine_similarity(query_vec, journal_tfidf_matrix).flatten()
    ranked = scores.argsort()[::-1]
    limit = min(journal_threshold, len(ranked))
    return [idx for idx in ranked if scores[idx] > 0][:limit]
115
+
116
def get_article_df(row):
    """Build the per-article dataframe for one journal row.

    Selects the articles of this journal from main_df, cleans titles and
    authors, and builds a 'Tags' text column from the nouns/adjectives of
    the title plus the authors and publication year.

    Parameters
    ----------
    row : pandas.Series
        A row of journal_main; only row.name is used, to look up the
        journal's publication_title.

    Returns
    -------
    pandas.DataFrame
        Columns item_title, url, Tags, indexed by the original main_df
        row index.
    """
    article = main_df.loc[main_df['publication_title'] == journal_main['publication_title'][row.name]].copy()
    article['item_title'] = article.apply(get_clean_text, index='item_title', axis=1)
    article['authors'] = article.apply(get_clean_text, index='authors', axis=1)
    article['Tokenized'] = article['item_title'].apply(word_tokenize)
    article['Tagged'] = article['Tokenized'].apply(pos_tag)
    # Keep nouns (NN*) and adjectives (JJ*) that are not stop words.
    # BUG FIX: the original condition lacked parentheses, so 'and' bound
    # tighter than 'or' and NN tokens bypassed the stop-word filter.
    article['Tags'] = article['Tagged'].apply(lambda tagged: [
        word for word, tag in tagged
        if (tag.startswith('NN') or tag.startswith('JJ')) and word.lower() not in stop_words
    ])
    article['Tags'] = article.apply(get_paragraph, index='Tags', axis=1)
    article['Tags'] = article.apply(lambda x: x['Tags'] + ' ' + x['authors'] + ' ' + str(x['publication_year']), axis=1)
    article = article.drop(['keywords', 'publication_title', 'Tokenized', 'Tagged', 'authors', 'publication_year'], axis=1)
    article.reset_index(inplace=True)
    article.set_index('index', inplace=True)
    return article
130
+
131
+
132
+
133
def get_vectorizer(row):
    """Create a fresh per-journal TF-IDF vectorizer.

    The *row* argument is ignored; it exists only so this function can
    be used with DataFrame.apply.
    """
    return TfidfVectorizer(decode_error='ignore', strip_accents='ascii')
136
+
137
+
138
def get_tfidf_matrix(row):
    """Fit this journal's vectorizer on its articles' 'Tags' column and
    return the resulting TF-IDF matrix."""
    per_journal_vectorizer = row['article_vectorizer']
    article_tags = row['article_df']['Tags']
    return per_journal_vectorizer.fit_transform(article_tags)
141
+
142
@st.cache_data
def article_preprocessing(df):
    """Attach per-journal article data to each journal row.

    Adds three columns: 'article_df' (the journal's cleaned articles),
    'article_vectorizer' (a fresh TF-IDF vectorizer), and
    'article_matrix' (that vectorizer fitted on the articles' Tags).
    Mutates *df* in place and returns it.
    """
    for column, builder in (
        ('article_df', get_article_df),
        ('article_vectorizer', get_vectorizer),
        ('article_matrix', get_tfidf_matrix),
    ):
        df[column] = df.apply(builder, axis=1)
    return df
148
+
149
# Precompute per-journal article dataframes, vectorizers and matrices.
journal_main=article_preprocessing(journal_main)
print('done')


# Maximum number of articles returned per matched journal.
article_threshold = 10
154
+
155
+
156
def get_article_recommendations(user_input):
    """Rank individual articles across the best-matching journals.

    For each journal returned by get_journal_index, scores that
    journal's articles against *user_input* and keeps at most
    article_threshold articles with strictly positive similarity.

    Returns a list of (similarity, article_index, journal_index)
    tuples sorted in descending order.
    """
    recommendations = []
    for journal_id in get_journal_index(user_input):
        query_vec = journal_main['article_vectorizer'][journal_id].transform([user_input])
        scores = cosine_similarity(query_vec, journal_main['article_matrix'][journal_id]).flatten()
        ranked = scores.argsort()[::-1]
        limit = min(article_threshold, len(ranked))
        recommendations.extend(
            [(scores[i], i, journal_id) for i in ranked if scores[i] > 0][:limit]
        )
    recommendations.sort(reverse=True)
    return recommendations
168
+
169
+
170
def get_links(user_input):
    """Resolve article recommendations into displayable link tuples.

    Returns a list of (article_title, article_link, article_id,
    journal_id) tuples, best match first. iloc column 0 of the article
    dataframe is the cleaned item_title and column 1 is the url.
    """
    recommendations = get_article_recommendations(user_input)
    print(recommendations)  # debug trace, kept from the original
    links = []
    # BUG FIX: the original unpacked the similarity score into a local
    # named 'cosine_similarity', shadowing the sklearn function imported
    # at the top of the file.
    for _similarity, article_id, journal_id in recommendations:
        article_df = journal_main['article_df'][journal_id]
        links.append((
            article_df.iloc[article_id, 0],
            article_df.iloc[article_id, 1],
            article_id,
            journal_id
        ))
    print(links)  # debug trace, kept from the original
    return links
184
+
185
# Define the Streamlit app
def main():
    """Render the Streamlit UI: read the user's interests and show the
    recommended articles when the button is pressed."""
    st.title("Article Recommendation System")

    # Get user input
    user_input = st.text_area("Enter your interests", height=100)

    # Recommendation button
    if st.button("Recommend Articles"):
        for article_title, article_link, article_id, journal_id in get_links(user_input):
            st.markdown(f"**Article Title:** {article_title}")
            st.markdown(f"**Article Link:** {article_link}")
            st.markdown(f"**Article ID:** {article_id}")
            st.markdown(f"**Journal ID:** {journal_id}")
            st.markdown("---")
202
+
203
# Standard script entry point: run the Streamlit app body.
if __name__ == '__main__':
    main()