Sai004 commited on
Commit
8f2e1c5
·
1 Parent(s): df40eb2

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +215 -5
app.py CHANGED
@@ -1,18 +1,228 @@
1
  import gradio
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
- def my_inference_function(name):
4
- return "Hello " + name + "!"
5
 
6
  gradio_interface = gradio.Interface(
7
- fn=my_inference_function,
8
  inputs="text",
9
- outputs="text",
10
  examples=[
11
  ["Jill"],
12
  ["Sam"]
13
  ],
14
  title="REST API with Gradio and Huggingface Spaces",
15
  description="This is a demo of how to build an AI powered REST API with Gradio and Huggingface Spaces – for free! Based on [this article](https://www.tomsoderlund.com/ai/building-ai-powered-rest-api). See the **Use via API** link at the bottom of this page.",
16
- article="© Tom Söderlund 2022"
17
  )
18
  gradio_interface.launch()
 
1
  import gradio
2
+ import pandas as pd
3
+ import psycopg2
4
+ import re
5
+ import nltk
6
+ from nltk.tokenize import word_tokenize
7
+ from nltk.tag import pos_tag
8
+ from nltk.corpus import stopwords
9
+ from sklearn.feature_extraction.text import TfidfVectorizer
10
+ from sklearn.metrics.pairwise import cosine_similarity
11
+ import unicodedata
12
+
13
+ nltk.download('punkt')
14
+ nltk.download('averaged_perceptron_tagger')
15
+ nltk.download('stopwords')
16
+
17
def get_paragraph(row, index):
    """Flatten the list of strings in ``row[index]`` into one lower-cased
    paragraph; every element is prefixed with a single space (so the
    result has a leading space whenever the list is non-empty)."""
    return ''.join(' ' + piece.lower() for piece in row[index])
22
+
23
def remove_accents(text):
    """Strip accents/diacritics: NFKD-decompose, drop the non-ASCII
    combining marks, and return a plain ASCII string."""
    decomposed = unicodedata.normalize('NFKD', text)
    return decomposed.encode('ASCII', 'ignore').decode('utf-8')
26
+
27
def get_clean_text(row, index):
    """Tokenize ``row[index]``, lower-case it, strip accents, and keep only
    purely alphabetic tokens longer than one character that are not in the
    module-level ``stop_words`` set.

    Returns '' when the value is not a string or is the literal "NULL".
    Each kept token is appended with a leading space.
    """
    if not isinstance(row[index], str):
        return ''
    if row[index] == "NULL":
        return ''
    clean_text = ''
    words = word_tokenize(row[index].lower())
    for word in words:
        word = word.replace(',', ' ')
        word = remove_accents(word)
        # NOTE(review): the regex already rejects any token containing '.',
        # so the trailing `word[1] != '.'` clause can never be False once the
        # match succeeds — it is dead code. Left as-is to preserve behavior.
        if re.match(r'^[a-zA-Z]+$', word) and word not in stop_words and len(word) > 1 and word[1] != '.':
            clean_text += ' ' + word
    return clean_text
40
+
41
def combine(row, indices):
    """Concatenate the string fields named by ``indices``, each prefixed
    with a single space, and return the combined text."""
    return ''.join(' ' + row[key] for key in indices)
46
+
47
# Tokens removed from all free text during cleaning (used by get_clean_text).
stop_words = set(stopwords.words('english'))
query = "SELECT * FROM base_springerdata"

# Module-level memo cache shared by the load/preprocess helpers below;
# the *_KEY constants name its entries.
CACHE={}
SQL_KEY='sql'
JOURNAL_COMPLETE='journal_complete'
JOURNAL_PARTIAL='journal_partial'
VECTORIZER='vectorizer'
JOURNAL_TFIDF='journal_tfidf'
56
+
57
# load sql
def load_sql_data(query):
    """Fetch the journal/article table from Postgres, drop ``item_doi``,
    and memoize the resulting DataFrame in ``CACHE[SQL_KEY]``.

    Connection settings can be overridden via DB_HOST / DB_NAME / DB_USER /
    DB_PASSWORD environment variables.
    """
    if SQL_KEY in CACHE:
        return CACHE[SQL_KEY]
    import os  # local import: only needed on the first (uncached) call
    # SECURITY(review): credentials were hard-coded in source. They remain
    # as fallbacks for backward compatibility, but should be rotated and
    # supplied via the environment instead of living in the repository.
    conn = psycopg2.connect(
        host=os.environ.get("DB_HOST", "ep-soft-art-878483.ap-southeast-1.aws.neon.tech"),
        database=os.environ.get("DB_NAME", "neondb"),
        user=os.environ.get("DB_USER", "Raghuveer22"),
        password=os.environ.get("DB_PASSWORD", "pw3tvedja4XU"),
    )
    try:
        df = pd.read_sql_query(query, conn)
    finally:
        # Close even when the query raises, so connections are not leaked.
        conn.close()
    df = df.drop(['item_doi'], axis=1)
    CACHE[SQL_KEY] = df
    return df
72
+ # main_df
73
+ main_df = load_sql_data(query)
74
+ # Close the database connection
75
+
76
+
77
+ # load journal_df
78
+
79
def get_journal_df(df):
    """Aggregate the per-article DataFrame into one row per journal
    (``publication_title``) with cleaned Articles/authors/keywords text and
    a combined 'Tags' column; memoized in ``CACHE[JOURNAL_PARTIAL]``."""
    if JOURNAL_PARTIAL in CACHE:
        return CACHE[JOURNAL_PARTIAL]
    # One list of article titles per journal.
    journal_art = df.groupby('publication_title')['item_title'].apply(list).reset_index(name='Articles')
    journal_art.set_index(['publication_title'], inplace=True)

    # One list of author strings per journal.
    journal_auth = df.groupby('publication_title')['authors'].apply(list).reset_index(name='authors')
    journal_auth.set_index('publication_title', inplace=True)

    # First keywords row per (journal, keywords) pair; drop per-article columns.
    journal_key = df.drop_duplicates(subset=["publication_title", "keywords"], keep='first')
    journal_key = journal_key.drop(['item_title', 'authors', 'publication_year', 'url'], axis=1)
    journal_key.set_index(['publication_title'], inplace=True)

    journal_main = journal_art.join([journal_key, journal_auth])
    print('journal_main intial')
    journal_main.reset_index(inplace=True)
    # Flatten the title/author lists into paragraphs, then normalize each
    # text column through get_clean_text (ordering matters: get_paragraph
    # must run before get_clean_text, which expects a string).
    journal_main['Articles'] = journal_main.apply(get_paragraph, index='Articles', axis=1)
    journal_main['Articles'] = journal_main.apply(get_clean_text, index='Articles', axis=1)
    journal_main['authors'] = journal_main.apply(get_paragraph, index='authors', axis=1)
    journal_main['authors'] = journal_main.apply(get_clean_text, index='authors', axis=1)
    journal_main['keywords'] = journal_main.apply(get_clean_text, index='keywords', axis=1)

    # 'Tags' = keywords + article titles + authors, cleaned once more.
    journal_main['Tags'] = journal_main.apply(combine, indices=['keywords', 'Articles', 'authors'], axis=1)
    journal_main['Tags'] = journal_main.apply(get_clean_text, index='Tags', axis=1)
    CACHE[JOURNAL_PARTIAL]=journal_main
    return journal_main
105
+
106
+ journal_main=get_journal_df(main_df)
107
+ print('journal_main processed')
108
+ # Journal Dataframe
109
+
110
+ # load tfidfs
111
+
112
def get_tfidfs(journal_main):
    """Fit a TF-IDF vectorizer over the journal 'Tags' column.

    Returns ``(vectorizer, journal_tfidf_matrix)``; both are memoized in
    CACHE under VECTORIZER and JOURNAL_TFIDF.
    """
    # BUG FIX: the original guard was `if VECTORIZER and JOURNAL_TFIDF in
    # CACHE` — VECTORIZER is a non-empty string and therefore always truthy,
    # so only the second key was actually checked. Test both keys explicitly.
    if VECTORIZER in CACHE and JOURNAL_TFIDF in CACHE:
        return CACHE[VECTORIZER], CACHE[JOURNAL_TFIDF]
    vectorizer = TfidfVectorizer(decode_error='ignore', strip_accents='ascii')
    journal_tfidf_matrix = vectorizer.fit_transform(journal_main['Tags'])
    CACHE[VECTORIZER] = vectorizer
    CACHE[JOURNAL_TFIDF] = journal_tfidf_matrix
    return vectorizer, journal_tfidf_matrix
120
+
121
+ vectorizer,journal_tfidf_matrix = get_tfidfs(journal_main)
122
+ print('tfids and vectorizer for journals completed')
123
+
124
def get_article_df(row):
    """Build the per-article DataFrame for the journal in this row of
    ``journal_main``: cleaned titles/authors, a POS-filtered 'Tags' text,
    with the publication year folded into the tags.

    Relies on module-level ``main_df``, ``journal_main`` and ``stop_words``.
    """
    article = main_df.loc[main_df['publication_title'] == journal_main['publication_title'][row.name]].copy()
    article['item_title'] = article.apply(get_clean_text, index='item_title', axis=1)
    article['authors'] = article.apply(get_clean_text, index='authors', axis=1)
    article['Tokenized'] = article['item_title'].apply(word_tokenize)
    article['Tagged'] = article['Tokenized'].apply(pos_tag)
    # BUG FIX: the original condition read `NN or (JJ and not-stopword)`
    # because `and` binds tighter than `or`, so noun tokens bypassed the
    # stopword filter. Intended (and now explicit): keep nouns OR adjectives
    # that are NOT stopwords. Results are unchanged in practice because
    # item_title was already stopword-filtered by get_clean_text above.
    article['Tags'] = article['Tagged'].apply(
        lambda tagged: [word for word, tag in tagged
                        if (tag.startswith('NN') or tag.startswith('JJ'))
                        and word.lower() not in stop_words])
    article['Tags'] = article.apply(get_paragraph, index='Tags', axis=1)
    article['Tags'] = article.apply(lambda x: x['Tags'] + ' ' + x['authors'] + ' ' + str(x['publication_year']), axis=1)
    article = article.drop(['keywords', 'publication_title', 'Tokenized', 'Tagged', 'authors', 'publication_year'], axis=1)
    article.reset_index(inplace=True)
    article.set_index('index', inplace=True)
    return article
138
+
139
+
140
+
141
def get_vectorizer(row):
    """Return a fresh TfidfVectorizer for one journal row. ``row`` is
    unused; the function exists so DataFrame.apply yields one independent
    vectorizer per journal."""
    return TfidfVectorizer(decode_error='ignore', strip_accents='ascii')
144
+
145
+
146
def get_tfidf_matrix(row):
    """Fit this row's per-journal vectorizer on its articles' 'Tags'
    column and return the resulting TF-IDF matrix."""
    return row['article_vectorizer'].fit_transform(row['article_df']['Tags'])
149
+
150
def article_preprocessing(df):
    """Attach per-journal article data to each row of ``df``:
    'article_df' (that journal's article DataFrame), 'article_vectorizer'
    (a fresh TF-IDF vectorizer) and 'article_matrix' (the fitted matrix).

    Memoized in ``CACHE[JOURNAL_COMPLETE]``. Note: mutates ``df`` in place
    and caches the same object it returns.
    """
    if JOURNAL_COMPLETE in CACHE:
        return CACHE[JOURNAL_COMPLETE]
    df['article_df'] = df.apply(get_article_df, axis=1)
    df['article_vectorizer'] = df.apply(get_vectorizer, axis=1)
    df['article_matrix'] = df.apply(get_tfidf_matrix, axis=1)
    CACHE[JOURNAL_COMPLETE]=df
    return df
158
+
159
+ journal_main=article_preprocessing(journal_main)
160
+ print('done')
161
+
162
+
163
+
164
+
165
+
166
+
167
+
168
+
169
+
170
+
171
+
172
+ # #### prediction
173
+ journal_threshold = 4
174
+
175
def get_journal_index(user_input):
    """Return the indices of up to ``journal_threshold`` journals whose
    TF-IDF tag vectors have strictly positive cosine similarity with the
    query text, most similar first. Uses the module-level ``vectorizer``
    and ``journal_tfidf_matrix``."""
    query_vec = vectorizer.transform([user_input])
    scores = cosine_similarity(query_vec, journal_tfidf_matrix).flatten()
    ranked = scores.argsort()[::-1]
    positive = [idx for idx in ranked if scores[idx] > 0]
    return positive[:min(journal_threshold, len(ranked))]
181
+
182
+ article_threshold = 10
183
+
184
+
185
def get_article_recommendations(user_input):
    """Rank articles inside the recommended journals by cosine similarity
    to the query. Returns a list of (score, article_index, journal_index)
    tuples sorted best-first, taking at most ``article_threshold``
    positive-score hits per journal."""
    results = []
    for journal_id in get_journal_index(user_input):
        query_vec = journal_main['article_vectorizer'][journal_id].transform([user_input])
        scores = cosine_similarity(query_vec, journal_main['article_matrix'][journal_id]).flatten()
        ranked = scores.argsort()[::-1]
        hits = [(scores[idx], idx, journal_id) for idx in ranked if scores[idx] > 0]
        results += hits[:min(article_threshold, len(ranked))]
    results.sort(reverse=True)
    return results
197
+
198
+
199
def get_links(user_input):
    """Resolve recommended articles to (col0, col1, article_id, journal_id)
    tuples, where col0/col1 are the first two columns of the journal's
    article_df — presumably title and URL; confirm against the DB schema.
    This is the Gradio inference entry point.
    """
    recommendations = get_article_recommendations(user_input)
    print(recommendations)
    links = []
    # FIX: the original unpacked the score into a local variable named
    # `cosine_similarity`, shadowing the sklearn function imported at module
    # level. Renamed to `score` (the value itself is unused here).
    for score, article_id, journal_id in recommendations:
        article_df = journal_main['article_df'][journal_id]
        links.append((
            article_df.iloc[article_id, 0],
            article_df.iloc[article_id, 1],
            article_id,
            journal_id
        ))
    print(links)
    return links
213
+
214
 
 
 
215
 
216
  gradio_interface = gradio.Interface(
217
+ fn=get_links,
218
  inputs="text",
219
+ outputs="list",
220
  examples=[
221
  ["Jill"],
222
  ["Sam"]
223
  ],
224
  title="REST API with Gradio and Huggingface Spaces",
225
  description="This is a demo of how to build an AI powered REST API with Gradio and Huggingface Spaces – for free! Based on [this article](https://www.tomsoderlund.com/ai/building-ai-powered-rest-api). See the **Use via API** link at the bottom of this page.",
226
+ article="© POSA MOKSHITH 2023"
227
  )
228
  gradio_interface.launch()