Sai004 commited on
Commit
8f2e1c5
·
1 Parent(s): df40eb2

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +215 -5
app.py CHANGED
@@ -1,18 +1,228 @@
1
  import gradio
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
- def my_inference_function(name):
4
- return "Hello " + name + "!"
5
 
6
  gradio_interface = gradio.Interface(
7
- fn=my_inference_function,
8
  inputs="text",
9
- outputs="text",
10
  examples=[
11
  ["Jill"],
12
  ["Sam"]
13
  ],
14
  title="REST API with Gradio and Huggingface Spaces",
15
  description="This is a demo of how to build an AI powered REST API with Gradio and Huggingface Spaces – for free! Based on [this article](https://www.tomsoderlund.com/ai/building-ai-powered-rest-api). See the **Use via API** link at the bottom of this page.",
16
- article="© Tom Söderlund 2022"
17
  )
18
  gradio_interface.launch()
 
1
  import gradio
2
+ import pandas as pd
3
+ import psycopg2
4
+ import re
5
+ import nltk
6
+ from nltk.tokenize import word_tokenize
7
+ from nltk.tag import pos_tag
8
+ from nltk.corpus import stopwords
9
+ from sklearn.feature_extraction.text import TfidfVectorizer
10
+ from sklearn.metrics.pairwise import cosine_similarity
11
+ import unicodedata
12
+
13
+ nltk.download('punkt')
14
+ nltk.download('averaged_perceptron_tagger')
15
+ nltk.download('stopwords')
16
+
17
def get_paragraph(row, index):
    """Flatten the list of strings in ``row[index]`` into one lower-cased
    paragraph; every element is prefixed with a single space (so the
    result has a leading space whenever the list is non-empty)."""
    return ''.join(' ' + piece.lower() for piece in row[index])
22
+
23
def remove_accents(text):
    """Strip accents/diacritics: NFKD-decompose, drop the non-ASCII
    combining marks, and return a plain ASCII string."""
    decomposed = unicodedata.normalize('NFKD', text)
    return decomposed.encode('ASCII', 'ignore').decode('utf-8')
26
+
27
def get_clean_text(row, index):
    """Tokenize ``row[index]``, lower-case it, strip accents, and keep only
    purely alphabetic tokens longer than one character that are not in the
    module-level ``stop_words`` set.

    Returns '' when the value is not a string or is the literal "NULL".
    Each kept token is appended with a leading space.
    """
    if not isinstance(row[index], str):
        return ''
    if row[index] == "NULL":
        return ''
    clean_text = ''
    words = word_tokenize(row[index].lower())
    for word in words:
        word = word.replace(',', ' ')
        word = remove_accents(word)
        # NOTE(review): the regex already rejects any token containing '.',
        # so the trailing `word[1] != '.'` clause can never be False once the
        # match succeeds — it is dead code. Left as-is to preserve behavior.
        if re.match(r'^[a-zA-Z]+$', word) and word not in stop_words and len(word) > 1 and word[1] != '.':
            clean_text += ' ' + word
    return clean_text
40
+
41
def combine(row, indices):
    """Concatenate the string fields named by ``indices``, each prefixed
    with a single space, and return the combined text."""
    return ''.join(' ' + row[key] for key in indices)
46
+
47
# Tokens removed from all free text during cleaning (used by get_clean_text).
stop_words = set(stopwords.words('english'))
query = "SELECT * FROM base_springerdata"

# Module-level memo cache shared by the load/preprocess helpers below;
# the *_KEY constants name its entries.
CACHE={}
SQL_KEY='sql'
JOURNAL_COMPLETE='journal_complete'
JOURNAL_PARTIAL='journal_partial'
VECTORIZER='vectorizer'
JOURNAL_TFIDF='journal_tfidf'
56
+
57
# load sql
def load_sql_data(query):
    """Fetch the journal/article table from Postgres, drop ``item_doi``,
    and memoize the resulting DataFrame in ``CACHE[SQL_KEY]``.

    Connection settings can be overridden via DB_HOST / DB_NAME / DB_USER /
    DB_PASSWORD environment variables.
    """
    if SQL_KEY in CACHE:
        return CACHE[SQL_KEY]
    import os  # local import: only needed on the first (uncached) call
    # SECURITY(review): credentials were hard-coded in source. They remain
    # as fallbacks for backward compatibility, but should be rotated and
    # supplied via the environment instead of living in the repository.
    conn = psycopg2.connect(
        host=os.environ.get("DB_HOST", "ep-soft-art-878483.ap-southeast-1.aws.neon.tech"),
        database=os.environ.get("DB_NAME", "neondb"),
        user=os.environ.get("DB_USER", "Raghuveer22"),
        password=os.environ.get("DB_PASSWORD", "pw3tvedja4XU"),
    )
    try:
        df = pd.read_sql_query(query, conn)
    finally:
        # Close even when the query raises, so connections are not leaked.
        conn.close()
    df = df.drop(['item_doi'], axis=1)
    CACHE[SQL_KEY] = df
    return df
72
+ # main_df
73
+ main_df = load_sql_data(query)
74
+ # Close the database connection
75
+
76
+
77
+ # load journal_df
78
+
79
def get_journal_df(df):
    """Aggregate the per-article DataFrame into one row per journal
    (``publication_title``) with cleaned Articles/authors/keywords text and
    a combined 'Tags' column; memoized in ``CACHE[JOURNAL_PARTIAL]``."""
    if JOURNAL_PARTIAL in CACHE:
        return CACHE[JOURNAL_PARTIAL]
    # One list of article titles per journal.
    journal_art = df.groupby('publication_title')['item_title'].apply(list).reset_index(name='Articles')
    journal_art.set_index(['publication_title'], inplace=True)

    # One list of author strings per journal.
    journal_auth = df.groupby('publication_title')['authors'].apply(list).reset_index(name='authors')
    journal_auth.set_index('publication_title', inplace=True)

    # First keywords row per (journal, keywords) pair; drop per-article columns.
    journal_key = df.drop_duplicates(subset=["publication_title", "keywords"], keep='first')
    journal_key = journal_key.drop(['item_title', 'authors', 'publication_year', 'url'], axis=1)
    journal_key.set_index(['publication_title'], inplace=True)

    journal_main = journal_art.join([journal_key, journal_auth])
    print('journal_main intial')
    journal_main.reset_index(inplace=True)
    # Flatten the title/author lists into paragraphs, then normalize each
    # text column through get_clean_text (ordering matters: get_paragraph
    # must run before get_clean_text, which expects a string).
    journal_main['Articles'] = journal_main.apply(get_paragraph, index='Articles', axis=1)
    journal_main['Articles'] = journal_main.apply(get_clean_text, index='Articles', axis=1)
    journal_main['authors'] = journal_main.apply(get_paragraph, index='authors', axis=1)
    journal_main['authors'] = journal_main.apply(get_clean_text, index='authors', axis=1)
    journal_main['keywords'] = journal_main.apply(get_clean_text, index='keywords', axis=1)

    # 'Tags' = keywords + article titles + authors, cleaned once more.
    journal_main['Tags'] = journal_main.apply(combine, indices=['keywords', 'Articles', 'authors'], axis=1)
    journal_main['Tags'] = journal_main.apply(get_clean_text, index='Tags', axis=1)
    CACHE[JOURNAL_PARTIAL]=journal_main
    return journal_main
105
+
106
+ journal_main=get_journal_df(main_df)
107
+ print('journal_main processed')
108
+ # Journal Dataframe
109
+
110
+ # load tfidfs
111
+
112
def get_tfidfs(journal_main):
    """Fit a TF-IDF vectorizer over the journal 'Tags' column.

    Returns ``(vectorizer, journal_tfidf_matrix)``; both are memoized in
    CACHE under VECTORIZER and JOURNAL_TFIDF.
    """
    # BUG FIX: the original guard was `if VECTORIZER and JOURNAL_TFIDF in
    # CACHE` — VECTORIZER is a non-empty string and therefore always truthy,
    # so only the second key was actually checked. Test both keys explicitly.
    if VECTORIZER in CACHE and JOURNAL_TFIDF in CACHE:
        return CACHE[VECTORIZER], CACHE[JOURNAL_TFIDF]
    vectorizer = TfidfVectorizer(decode_error='ignore', strip_accents='ascii')
    journal_tfidf_matrix = vectorizer.fit_transform(journal_main['Tags'])
    CACHE[VECTORIZER] = vectorizer
    CACHE[JOURNAL_TFIDF] = journal_tfidf_matrix
    return vectorizer, journal_tfidf_matrix
120
+
121
+ vectorizer,journal_tfidf_matrix = get_tfidfs(journal_main)
122
+ print('tfids and vectorizer for journals completed')
123
+
124
def get_article_df(row):
    """Build the per-article DataFrame for the journal in this row of
    ``journal_main``: cleaned titles/authors, a POS-filtered 'Tags' text,
    with the publication year folded into the tags.

    Relies on module-level ``main_df``, ``journal_main`` and ``stop_words``.
    """
    article = main_df.loc[main_df['publication_title'] == journal_main['publication_title'][row.name]].copy()
    article['item_title'] = article.apply(get_clean_text, index='item_title', axis=1)
    article['authors'] = article.apply(get_clean_text, index='authors', axis=1)
    article['Tokenized'] = article['item_title'].apply(word_tokenize)
    article['Tagged'] = article['Tokenized'].apply(pos_tag)
    # BUG FIX: the original condition read `NN or (JJ and not-stopword)`
    # because `and` binds tighter than `or`, so noun tokens bypassed the
    # stopword filter. Intended (and now explicit): keep nouns OR adjectives
    # that are NOT stopwords. Results are unchanged in practice because
    # item_title was already stopword-filtered by get_clean_text above.
    article['Tags'] = article['Tagged'].apply(
        lambda tagged: [word for word, tag in tagged
                        if (tag.startswith('NN') or tag.startswith('JJ'))
                        and word.lower() not in stop_words])
    article['Tags'] = article.apply(get_paragraph, index='Tags', axis=1)
    article['Tags'] = article.apply(lambda x: x['Tags'] + ' ' + x['authors'] + ' ' + str(x['publication_year']), axis=1)
    article = article.drop(['keywords', 'publication_title', 'Tokenized', 'Tagged', 'authors', 'publication_year'], axis=1)
    article.reset_index(inplace=True)
    article.set_index('index', inplace=True)
    return article
138
+
139
+
140
+
141
def get_vectorizer(row):
    """Return a fresh TfidfVectorizer for one journal row. ``row`` is
    unused; the function exists so DataFrame.apply yields one independent
    vectorizer per journal."""
    return TfidfVectorizer(decode_error='ignore', strip_accents='ascii')
144
+
145
+
146
def get_tfidf_matrix(row):
    """Fit this row's per-journal vectorizer on its articles' 'Tags'
    column and return the resulting TF-IDF matrix."""
    return row['article_vectorizer'].fit_transform(row['article_df']['Tags'])
149
+
150
def article_preprocessing(df):
    """Attach per-journal article data to each row of ``df``:
    'article_df' (that journal's article DataFrame), 'article_vectorizer'
    (a fresh TF-IDF vectorizer) and 'article_matrix' (the fitted matrix).

    Memoized in ``CACHE[JOURNAL_COMPLETE]``. Note: mutates ``df`` in place
    and caches the same object it returns.
    """
    if JOURNAL_COMPLETE in CACHE:
        return CACHE[JOURNAL_COMPLETE]
    df['article_df'] = df.apply(get_article_df, axis=1)
    df['article_vectorizer'] = df.apply(get_vectorizer, axis=1)
    df['article_matrix'] = df.apply(get_tfidf_matrix, axis=1)
    CACHE[JOURNAL_COMPLETE]=df
    return df
158
+
159
+ journal_main=article_preprocessing(journal_main)
160
+ print('done')
161
+
162
+
163
+
164
+
165
+
166
+
167
+
168
+
169
+
170
+
171
+
172
+ # #### prediction
173
+ journal_threshold = 4
174
+
175
def get_journal_index(user_input):
    """Return the indices of up to ``journal_threshold`` journals whose
    TF-IDF tag vectors have strictly positive cosine similarity with the
    query text, most similar first. Uses the module-level ``vectorizer``
    and ``journal_tfidf_matrix``."""
    query_vec = vectorizer.transform([user_input])
    scores = cosine_similarity(query_vec, journal_tfidf_matrix).flatten()
    ranked = scores.argsort()[::-1]
    positive = [idx for idx in ranked if scores[idx] > 0]
    return positive[:min(journal_threshold, len(ranked))]
181
+
182
+ article_threshold = 10
183
+
184
+
185
def get_article_recommendations(user_input):
    """Rank articles inside the recommended journals by cosine similarity
    to the query. Returns a list of (score, article_index, journal_index)
    tuples sorted best-first, taking at most ``article_threshold``
    positive-score hits per journal."""
    results = []
    for journal_id in get_journal_index(user_input):
        query_vec = journal_main['article_vectorizer'][journal_id].transform([user_input])
        scores = cosine_similarity(query_vec, journal_main['article_matrix'][journal_id]).flatten()
        ranked = scores.argsort()[::-1]
        hits = [(scores[idx], idx, journal_id) for idx in ranked if scores[idx] > 0]
        results += hits[:min(article_threshold, len(ranked))]
    results.sort(reverse=True)
    return results
197
+
198
+
199
def get_links(user_input):
    """Resolve recommended articles to (col0, col1, article_id, journal_id)
    tuples, where col0/col1 are the first two columns of the journal's
    article_df — presumably title and URL; confirm against the DB schema.
    This is the Gradio inference entry point.
    """
    recommendations = get_article_recommendations(user_input)
    print(recommendations)
    links = []
    # FIX: the original unpacked the score into a local variable named
    # `cosine_similarity`, shadowing the sklearn function imported at module
    # level. Renamed to `score` (the value itself is unused here).
    for score, article_id, journal_id in recommendations:
        article_df = journal_main['article_df'][journal_id]
        links.append((
            article_df.iloc[article_id, 0],
            article_df.iloc[article_id, 1],
            article_id,
            journal_id
        ))
    print(links)
    return links
213
+
214
 
 
 
215
 
216
  gradio_interface = gradio.Interface(
217
+ fn=get_links,
218
  inputs="text",
219
+ outputs="list",
220
  examples=[
221
  ["Jill"],
222
  ["Sam"]
223
  ],
224
  title="REST API with Gradio and Huggingface Spaces",
225
  description="This is a demo of how to build an AI powered REST API with Gradio and Huggingface Spaces – for free! Based on [this article](https://www.tomsoderlund.com/ai/building-ai-powered-rest-api). See the **Use via API** link at the bottom of this page.",
226
+ article="© POSA MOKSHITH 2023"
227
  )
228
  gradio_interface.launch()