Sai004 commited on
Commit
d50452c
·
1 Parent(s): 1843061

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +176 -0
  2. requirements.txt +4 -0
app.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import psycopg2
4
+ import re
5
+ import nltk
6
+ from nltk.tokenize import word_tokenize
7
+ from nltk.tag import pos_tag
8
+ from nltk.corpus import stopwords
9
+ from sklearn.feature_extraction.text import TfidfVectorizer
10
+ from sklearn.metrics.pairwise import cosine_similarity
11
+ import unicodedata
12
+
13
+ nltk.download('punkt')
14
+ nltk.download('averaged_perceptron_tagger')
15
+ nltk.download('stopwords')
16
+
17
# Connect to the PostgreSQL database.
# SECURITY NOTE(review): credentials are hard-coded in source; move them to
# environment variables / Streamlit secrets and rotate this password.
conn = psycopg2.connect(
    host="ep-soft-art-878483.ap-southeast-1.aws.neon.tech",
    database="neondb",
    user="Raghuveer22",
    password="pw3tvedja4XU"
)
try:
    # Read the whole Springer table into a DataFrame.
    query = "SELECT * FROM base_springerdata"
    main = pd.read_sql_query(query, conn)
finally:
    # Close the connection even if the query raises (the original leaked
    # the connection on error).
    conn.close()
31
+
32
# The DOI column is not used anywhere below.
main = main.drop(['item_doi'], axis=1)

# Journal Dataframe: one row per journal (publication_title).
# Articles: every item_title of the journal collected into a list.
journal_art = main.groupby('publication_title')['item_title'].apply(list).reset_index(name='Articles')
journal_art.set_index(['publication_title'], inplace=True)

# authors: every per-article author string of the journal collected into a list.
journal_auth = main.groupby('publication_title')['authors'].apply(list).reset_index(name='authors')
journal_auth.set_index('publication_title', inplace=True)

# keywords: keep the first (publication_title, keywords) pair per journal and
# drop the per-article columns, leaving the journal-level keyword string.
journal_key = main.drop_duplicates(subset=["publication_title", "keywords"], keep='first')
journal_key = journal_key.drop(['item_title', 'authors', 'publication_year', 'url'], axis=1)
journal_key.set_index(['publication_title'], inplace=True)

# Join the three per-journal frames on publication_title.
# NOTE(review): if a journal carries more than one distinct keywords string,
# drop_duplicates keeps multiple rows under the same index and this join can
# duplicate journal rows — confirm keywords are unique per journal.
journal_main = journal_art.join([journal_key, journal_auth])
journal_main.reset_index(inplace=True)

# English stopword set shared by the cleaning helpers below.
stop_words = set(stopwords.words('english'))
49
+
50
def get_paragraph(row, index):
    """Flatten the list stored at row[index] into one lower-cased string.

    Each element is preceded by a single space, so the result carries a
    leading space (' a b c') and '' for an empty list.
    """
    return ''.join(' ' + item.lower() for item in row[index])
55
+
56
def remove_accents(text):
    """Strip accents by NFKD-decomposing and dropping non-ASCII code points."""
    decomposed = unicodedata.normalize('NFKD', text)
    return decomposed.encode('ASCII', 'ignore').decode('utf-8')
59
+
60
def get_clean_text(row, index):
    """Tokenize row[index], lower-case it, and keep only accent-stripped,
    purely alphabetic, non-stopword tokens longer than one character.

    Returns '' for non-string cells and for the literal string "NULL".
    The result carries a leading space (' word1 word2') to match the shape
    produced by get_paragraph.
    """
    text = row[index]
    if not isinstance(text, str) or text == "NULL":
        return ''
    kept = []
    for word in word_tokenize(text.lower()):
        word = remove_accents(word.replace(',', ' '))
        # The original also tested word[1] != '.', but a token matching
        # ^[a-zA-Z]+$ cannot contain '.', so that check was dead code.
        if len(word) > 1 and re.match(r'^[a-zA-Z]+$', word) and word not in stop_words:
            kept.append(' ' + word)
    return ''.join(kept)
73
+
74
def combine(row, indices):
    """Join the string fields named by *indices*, each preceded by a space."""
    pieces = []
    for key in indices:
        pieces.append(' ' + row[key])
    return ''.join(pieces)
79
+
80
# Flatten the list columns into single lower-cased strings, then strip each
# down to alphabetic, non-stopword tokens.
journal_main['Articles'] = journal_main.apply(get_paragraph, index='Articles', axis=1)
journal_main['Articles'] = journal_main.apply(get_clean_text, index='Articles', axis=1)
journal_main['authors'] = journal_main.apply(get_paragraph, index='authors', axis=1)
journal_main['authors'] = journal_main.apply(get_clean_text, index='authors', axis=1)
journal_main['keywords'] = journal_main.apply(get_clean_text, index='keywords', axis=1)

# Tags: combined keyword/title/author text — the journal's TF-IDF document.
journal_main['Tags'] = journal_main.apply(combine, indices=['keywords', 'Articles', 'authors'], axis=1)
journal_main['Tags'] = journal_main.apply(get_clean_text, index='Tags', axis=1)
88
+
89
# Journal-level TF-IDF model: one document per journal (its Tags string).
vectorizer = TfidfVectorizer(decode_error='ignore', strip_accents='ascii')
journal_tfidf_matrix = vectorizer.fit_transform(journal_main['Tags'])

# Maximum number of journals returned by get_journal_index.
journal_threshold = 4
93
+
94
def get_journal_index(user_input):
    """Rank journals by TF-IDF cosine similarity to *user_input*.

    Returns up to journal_threshold row indices into journal_main, best
    match first, keeping only strictly positive similarities.
    """
    user_tfidf = vectorizer.transform([user_input])
    scores = cosine_similarity(user_tfidf, journal_tfidf_matrix).flatten()
    ranked = scores.argsort()[::-1]
    # Slicing already clamps to the sequence length, so the original
    # min(journal_threshold, len(indices)) guard was unnecessary.
    return [i for i in ranked if scores[i] > 0][:journal_threshold]
100
+
101
def get_article_df(row):
    """Build the per-journal article frame used for article-level matching.

    For the journal on this journal_main row: clean the title and author
    text, POS-tag the title tokens, keep noun/adjective words as tags, and
    append the authors plus publication year. Returns a DataFrame indexed
    by the original row index of *main*, with a 'Tags' column ready for
    TF-IDF fitting.
    """
    article = main.loc[main['publication_title'] == journal_main['publication_title'][row.name]].copy()
    article['item_title'] = article.apply(get_clean_text, index='item_title', axis=1)
    article['authors'] = article.apply(get_clean_text, index='authors', axis=1)
    article['Tokenized'] = article['item_title'].apply(word_tokenize)
    article['Tagged'] = article['Tokenized'].apply(pos_tag)
    # BUG FIX: 'and' binds tighter than 'or', so the original kept every
    # NN-tagged word without applying the stopword test. Parenthesized as
    # intended. (Titles were already stopword-filtered by get_clean_text,
    # so results should not change in practice.)
    article['Tags'] = article['Tagged'].apply(
        lambda tagged: [word for word, tag in tagged
                        if (tag.startswith('NN') or tag.startswith('JJ'))
                        and word.lower() not in stop_words])
    article['Tags'] = article.apply(get_paragraph, index='Tags', axis=1)
    article['Tags'] = article.apply(
        lambda r: r['Tags'] + ' ' + r['authors'] + ' ' + str(r['publication_year']), axis=1)
    article = article.drop(['keywords', 'publication_title', 'Tokenized', 'Tagged',
                            'authors', 'publication_year'], axis=1)
    article.reset_index(inplace=True)
    article.set_index('index', inplace=True)
    return article
115
+
116
# Precompute each journal's article DataFrame (see get_article_df).
journal_main['article_df'] = journal_main.apply(get_article_df, axis=1)
117
+
118
def get_vectorizer(row):
    """Return a fresh per-journal TfidfVectorizer.

    *row* is unused; the parameter exists only so DataFrame.apply can call
    this per row.
    """
    return TfidfVectorizer(decode_error='ignore', strip_accents='ascii')
121
+
122
def get_tfidf_matrix(row):
    """Fit this row's vectorizer on its articles' Tags; return the matrix."""
    per_journal_vectorizer = row['article_vectorizer']
    return per_journal_vectorizer.fit_transform(row['article_df']['Tags'])
125
+
126
# Per-journal TF-IDF: each journal gets its own vectorizer fitted on its
# articles' Tags, plus the resulting document-term matrix.
journal_main['article_vectorizer'] = journal_main.apply(get_vectorizer, axis=1)
journal_main['article_matrix'] = journal_main.apply(get_tfidf_matrix, axis=1)

# Maximum number of articles returned per journal.
article_threshold = 10
130
+
131
def get_article_recommendations(user_input):
    """Collect article matches across the recommended journals.

    Returns a list of (similarity, article_position, journal_id) tuples
    sorted highest-similarity first; each journal contributes at most
    article_threshold positive-similarity articles.
    """
    recommendations = []
    for journal_id in get_journal_index(user_input):
        user_tfidf = journal_main['article_vectorizer'][journal_id].transform([user_input])
        scores = cosine_similarity(user_tfidf, journal_main['article_matrix'][journal_id]).flatten()
        ranked = scores.argsort()[::-1]
        # Slicing clamps automatically, so the original min(...) guard was
        # unnecessary.
        recommendations += [(scores[i], i, journal_id)
                            for i in ranked if scores[i] > 0][:article_threshold]
    recommendations.sort(reverse=True)
    return recommendations
143
+
144
def get_links(user_input):
    """Resolve recommendations into (title, link, article_id, journal_id).

    Positions 0 and 1 of the per-journal article frame are read positionally
    — assumes item_title and url are the first two columns left after
    get_article_df's drops; TODO confirm against the table's column order.
    """
    links = []
    # NOTE: the original unpacked the score into a local named
    # 'cosine_similarity', shadowing the sklearn import; renamed to 'score'.
    for score, article_id, journal_id in get_article_recommendations(user_input):
        article_df = journal_main['article_df'][journal_id]
        links.append((
            article_df.iloc[article_id, 0],
            article_df.iloc[article_id, 1],
            article_id,
            journal_id,
        ))
    return links
156
+
157
# Define the Streamlit app.
def run_app():
    """Render the recommender UI: read interests, list matching articles.

    Renamed from main() — the original function shadowed the module-level
    DataFrame also named `main` (read by get_article_df).
    """
    st.title("Article Recommendation System")

    # Get user input; the free text drives both TF-IDF queries.
    user_input = st.text_area("Enter your interests", height=100)

    # Recommendation button
    if st.button("Recommend Articles"):
        for article_title, article_link, article_id, journal_id in get_links(user_input):
            st.markdown(f"**Article Title:** {article_title}")
            st.markdown(f"**Article Link:** {article_link}")
            st.markdown(f"**Article ID:** {article_id}")
            st.markdown(f"**Journal ID:** {journal_id}")
            st.markdown("---")

if __name__ == '__main__':
    run_app()
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
psycopg2
pandas
nltk
scikit-learn
streamlit