Spaces:

Sai004
/

ArticleAPI

Sleeping

App Files Files Community

Sai004 committed on Jul 19, 2023

Commit

7c4da79

1 Parent(s): c078a46

Updated app.py

Browse files

Files changed (1) hide show

app.py +97 -71

app.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import gradio
 import pandas as pd
 import psycopg2
@@ -14,16 +15,20 @@ nltk.download('punkt')
 nltk.download('averaged_perceptron_tagger')
 nltk.download('stopwords')
 def get_paragraph(row, index):
     ans = ''
     for x in row[index]:
         ans = ans + ' ' + x.lower()
     return ans
 def remove_accents(text):
-    text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('utf-8')
     return text
 def get_clean_text(row, index):
     if not isinstance(row[index], str):
         return ''
@@ -38,22 +43,23 @@ def get_clean_text(row, index):
             clean_text += ' ' + word
     return clean_text
 def combine(row, indices):
     ans = ''
     for i in indices:
         ans = ans + ' ' + row[i]
     return ans
 stop_words = set(stopwords.words('english'))
 query = "SELECT * FROM base_springerdata"
-CACHE={}
-SQL_KEY='sql'
-JOURNAL_COMPLETE='journal_complete'
-JOURNAL_PARTIAL='journal_partial'
-VECTORIZER='vectorizer'
-JOURNAL_TFIDF='journal_tfidf'
-import os
 # Access the secrets
 HOST = os.getenv('DATABASE_HOST')
@@ -61,131 +67,149 @@ DATABASE = os.getenv('DATABASE_NAME')
 USER = os.getenv('DATABASE_USER')
 PASSWORD = os.getenv('DATABASE_PASSWORD')
 # load sql
 def load_sql_data(query):
     if SQL_KEY in CACHE:
         return CACHE[SQL_KEY]
     conn = psycopg2.connect(
-    host=HOST,
-    database=DATABASE,
-    user=USER,
-    password=PASSWORD,
-    sslmode="require"
     )
-    df =pd.read_sql_query(query, conn)
     df = df.drop(['item_doi'], axis=1)
     conn.close()
     CACHE[SQL_KEY] = df
     return df
 # main_df
 main_df = load_sql_data(query)
-# Close the database connection
 # load journal_df
 def get_journal_df(df):
     if JOURNAL_PARTIAL in CACHE:
         return CACHE[JOURNAL_PARTIAL]
-    journal_art = df.groupby('publication_title')['item_title'].apply(list).reset_index(name='Articles')
     journal_art.set_index(['publication_title'], inplace=True)
-    journal_auth = df.groupby('publication_title')['authors'].apply(list).reset_index(name='authors')
     journal_auth.set_index('publication_title', inplace=True)
-    journal_key = df.drop_duplicates(subset=["publication_title", "keywords"], keep='first')
-    journal_key = journal_key.drop(['item_title', 'authors', 'publication_year', 'url'], axis=1)
     journal_key.set_index(['publication_title'], inplace=True)
     journal_main = journal_art.join([journal_key, journal_auth])
     print('journal_main intial')
     journal_main.reset_index(inplace=True)
-    journal_main['Articles'] = journal_main.apply(get_paragraph, index='Articles', axis=1)
-    journal_main['Articles'] = journal_main.apply(get_clean_text, index='Articles', axis=1)
-    journal_main['authors'] = journal_main.apply(get_paragraph, index='authors', axis=1)
-    journal_main['authors'] = journal_main.apply(get_clean_text, index='authors', axis=1)
-    journal_main['keywords'] = journal_main.apply(get_clean_text, index='keywords', axis=1)
-    journal_main['Tags'] = journal_main.apply(combine, indices=['keywords', 'Articles', 'authors'], axis=1)
-    journal_main['Tags'] = journal_main.apply(get_clean_text, index='Tags', axis=1)
-    CACHE[JOURNAL_PARTIAL]=journal_main
     return journal_main
-journal_main=get_journal_df(main_df)
-print('journal_main processed')
 # Journal Dataframe
-# load tfidfs
 def get_tfidfs(journal_main):
     if VECTORIZER and JOURNAL_TFIDF in CACHE:
-        return CACHE[VECTORIZER],CACHE[JOURNAL_TFIDF]
     vectorizer = TfidfVectorizer(decode_error='ignore', strip_accents='ascii')
     journal_tfidf_matrix = vectorizer.fit_transform(journal_main['Tags'])
-    CACHE[VECTORIZER]=vectorizer
-    CACHE[JOURNAL_TFIDF]=journal_tfidf_matrix
-    return vectorizer,journal_tfidf_matrix
-vectorizer,journal_tfidf_matrix  = get_tfidfs(journal_main)
 print('tfids and vectorizer for journals completed')
 def get_article_df(row):
-    article = main_df.loc[main_df['publication_title'] == journal_main['publication_title'][row.name]].copy()
-    article['item_title'] = article.apply(get_clean_text, index='item_title', axis=1)
     article['authors'] = article.apply(get_clean_text, index='authors', axis=1)
     article['Tokenized'] = article['item_title'].apply(word_tokenize)
     article['Tagged'] = article['Tokenized'].apply(pos_tag)
     article['Tags'] = article['Tagged'].apply(lambda x: [word for word, tag in x if
                                                          tag.startswith('NN') or tag.startswith('JJ') and word.lower() not in stop_words])
     article['Tags'] = article.apply(get_paragraph, index='Tags', axis=1)
-    article['Tags'] = article.apply(lambda x: x['Tags'] + ' ' + x['authors'] + ' ' + str(x['publication_year']), axis=1)
-    article = article.drop(['keywords', 'publication_title', 'Tokenized', 'Tagged', 'authors', 'publication_year'], axis=1)
     article.reset_index(inplace=True)
     article.set_index('index', inplace=True)
     return article
 def get_vectorizer(row):
     vectorizer = TfidfVectorizer(decode_error='ignore', strip_accents='ascii')
     return vectorizer
 def get_tfidf_matrix(row):
-    tfidf_matrix = row['article_vectorizer'].fit_transform(row['article_df']['Tags'])
     return tfidf_matrix
 def article_preprocessing(df):
     if JOURNAL_COMPLETE in CACHE:
         return CACHE[JOURNAL_COMPLETE]
     df['article_df'] = df.apply(get_article_df, axis=1)
     df['article_vectorizer'] = df.apply(get_vectorizer, axis=1)
     df['article_matrix'] = df.apply(get_tfidf_matrix, axis=1)
-    CACHE[JOURNAL_COMPLETE]=df
     return df
-journal_main=article_preprocessing(journal_main)
-print('done')
-# #### prediction
 journal_threshold = 4
 def get_journal_index(user_input):
     user_tfidf = vectorizer.transform([user_input])
-    cosine_similarities = cosine_similarity(user_tfidf, journal_tfidf_matrix).flatten()
     indices = cosine_similarities.argsort()[::-1]
-    top_recommendations = [i for i in indices if cosine_similarities[i] > 0][:min(journal_threshold, len(indices))]
     return top_recommendations
 article_threshold = 10
@@ -193,8 +217,10 @@ def get_article_recommendations(user_input):
     recommended_journals = get_journal_index(user_input)
     recommendations = []
     for journal_id in recommended_journals:
-        user_tfidf = journal_main['article_vectorizer'][journal_id].transform([user_input])
-        cosine_similarities = cosine_similarity(user_tfidf, journal_main['article_matrix'][journal_id]).flatten()
         indices = cosine_similarities.argsort()[::-1]
         top_recommendation_articles = [(cosine_similarities[i], i, journal_id) for i in indices if
                                        cosine_similarities[i] > 0][:min(article_threshold, len(indices))]
@@ -218,18 +244,18 @@ def get_links(user_input):
     return links
 gradio_interface = gradio.Interface(
-  fn=get_links,
-  inputs="text",
-  outputs=gradio.outputs.JSON(),
-  examples=[
-    ["Jill"],
-    ["Sam"]
-  ],
-  title="REST API with Gradio and Huggingface Spaces",
-  description="This is a demo of how to build an AI powered REST API with Gradio and Huggingface Spaces – for free! Based on [this article](https://www.tomsoderlund.com/ai/building-ai-powered-rest-api). See the **Use via API** link at the bottom of this page.",
-  article="© POSA MOKSHITH 2023"
 )
-gradio_interface.launch()

+import os
 import gradio
 import pandas as pd
 import psycopg2
 nltk.download('averaged_perceptron_tagger')
 nltk.download('stopwords')
 def get_paragraph(row, index):
     """Lower-case every item of ``row[index]`` and glue them into one string.

     Each item is preceded by a single space, so a non-empty result starts
     with a space; an empty ``row[index]`` yields ``''``.
     """
     return ''.join(' ' + item.lower() for item in row[index])
 def remove_accents(text):
     """Return ``text`` with accents stripped and non-ASCII characters dropped."""
     # NFKD splits an accented letter into base letter + combining mark; the
     # ASCII encode with 'ignore' then discards the marks and every other
     # character that has no ASCII representation.
     decomposed = unicodedata.normalize('NFKD', text)
     ascii_bytes = decomposed.encode('ASCII', 'ignore')
     return ascii_bytes.decode('utf-8')
 def get_clean_text(row, index):
     if not isinstance(row[index], str):
         return ''
             clean_text += ' ' + word
     return clean_text
 def combine(row, indices):
     """Concatenate ``row[i]`` for each ``i`` in ``indices``, space-prefixed.

     Every selected value must already be a string; the result starts with a
     space whenever ``indices`` is non-empty.
     """
     return ''.join(' ' + row[key] for key in indices)
 stop_words = set(stopwords.words('english'))
 query = "SELECT * FROM base_springerdata"
+CACHE = {}
+SQL_KEY = 'sql'
+JOURNAL_COMPLETE = 'journal_complete'
+JOURNAL_PARTIAL = 'journal_partial'
+VECTORIZER = 'vectorizer'
+JOURNAL_TFIDF = 'journal_tfidf'
 # Access the secrets
 HOST = os.getenv('DATABASE_HOST')
 USER = os.getenv('DATABASE_USER')
 PASSWORD = os.getenv('DATABASE_PASSWORD')
 # load sql
 def load_sql_data(query):
     """Run ``query`` against the configured PostgreSQL database.

     The result is memoised in ``CACHE`` under ``SQL_KEY``. The cache key does
     not include ``query``, so once a frame is cached every later call returns
     it regardless of the query passed in.

     Args:
         query: SQL text handed to ``pandas.read_sql_query``.

     Returns:
         The query result as a DataFrame with the ``item_doi`` column removed.
     """
     if SQL_KEY in CACHE:
         return CACHE[SQL_KEY]
     conn = psycopg2.connect(
         host=HOST,
         database=DATABASE,
         user=USER,
         password=PASSWORD
     )
     try:
         df = pd.read_sql_query(query, conn)
     finally:
         # Close the database connection even when the query raises, so a
         # failed load does not leak the connection.
         conn.close()
     df = df.drop(['item_doi'], axis=1)
     CACHE[SQL_KEY] = df
     return df
 # main_df
 main_df = load_sql_data(query)
 # load journal_df
 def get_journal_df(df):
     """Aggregate the article table by ``publication_title`` into text profiles.

     Titles and authors are collected per journal, joined with the journal's
     keyword rows, cleaned, and merged into a single ``'Tags'`` text column.
     The finished frame is memoised in ``CACHE`` under ``JOURNAL_PARTIAL``.

     Args:
         df: Article-level frame with ``publication_title``, ``item_title``,
             ``authors``, ``keywords``, ``publication_year`` and ``url`` columns.

     Returns:
         The journal-level frame with ``Articles``, ``authors``, ``keywords``
         and ``Tags`` text columns.
     """
     if JOURNAL_PARTIAL in CACHE:
         return CACHE[JOURNAL_PARTIAL]

     by_journal = df.groupby('publication_title')

     # Lists of titles / authors per journal, indexed by journal title.
     journal_art = by_journal['item_title'].apply(list).reset_index(name='Articles')
     journal_art = journal_art.set_index(['publication_title'])
     journal_auth = by_journal['authors'].apply(list).reset_index(name='authors')
     journal_auth = journal_auth.set_index('publication_title')

     # One row per distinct (journal, keywords) pair, article-only columns removed.
     journal_key = (
         df.drop_duplicates(subset=["publication_title", "keywords"], keep='first')
         .drop(['item_title', 'authors', 'publication_year', 'url'], axis=1)
         .set_index(['publication_title'])
     )

     journal_main = journal_art.join([journal_key, journal_auth])
     print('journal_main intial')
     journal_main = journal_main.reset_index()

     # Applied in order: list columns are flattened to text before cleaning.
     text_steps = (
         ('Articles', get_paragraph),
         ('Articles', get_clean_text),
         ('authors', get_paragraph),
         ('authors', get_clean_text),
         ('keywords', get_clean_text),
     )
     for column, step in text_steps:
         journal_main[column] = journal_main.apply(step, index=column, axis=1)

     journal_main['Tags'] = journal_main.apply(
         combine, indices=['keywords', 'Articles', 'authors'], axis=1)
     journal_main['Tags'] = journal_main.apply(
         get_clean_text, index='Tags', axis=1)

     CACHE[JOURNAL_PARTIAL] = journal_main
     return journal_main
 # Journal Dataframe
+journal_main = get_journal_df(main_df)
+print('journal_main processed')
+# load tfidfs
 def get_tfidfs(journal_main):
     """Fit, or fetch from ``CACHE``, the journal-level TF-IDF model.

     Args:
         journal_main: Frame whose ``'Tags'`` column holds the text to vectorise.

     Returns:
         ``(vectorizer, journal_tfidf_matrix)``: the fitted ``TfidfVectorizer``
         and the matrix it produced from ``journal_main['Tags']``.
     """
     # Both entries must be cached before either is returned. The earlier
     # condition ``VECTORIZER and JOURNAL_TFIDF in CACHE`` only looked up
     # JOURNAL_TFIDF: ``in`` binds tighter than ``and`` and VECTORIZER is a
     # non-empty (always truthy) string.
     if VECTORIZER in CACHE and JOURNAL_TFIDF in CACHE:
         return CACHE[VECTORIZER], CACHE[JOURNAL_TFIDF]
     vectorizer = TfidfVectorizer(decode_error='ignore', strip_accents='ascii')
     journal_tfidf_matrix = vectorizer.fit_transform(journal_main['Tags'])
     CACHE[VECTORIZER] = vectorizer
     CACHE[JOURNAL_TFIDF] = journal_tfidf_matrix
     return vectorizer, journal_tfidf_matrix
+vectorizer, journal_tfidf_matrix = get_tfidfs(journal_main)
 print('tfids and vectorizer for journals completed')
 def get_article_df(row):
     """Build the per-article frame for the journal identified by ``row``.

     Selects the rows of the global ``main_df`` whose ``publication_title``
     equals ``journal_main['publication_title'][row.name]``, cleans the title
     and authors, and derives a ``'Tags'`` string from the title's nouns and
     adjectives plus the authors and the publication year.

     Args:
         row: Row object whose ``name`` is a label of the global
             ``journal_main`` frame (presumably a row of ``journal_main``
             itself; verify against the caller).

     Returns:
         A copy of the matching articles with the helper columns dropped.
     """
     journal_title = journal_main['publication_title'][row.name]
     article = main_df.loc[main_df['publication_title'] == journal_title].copy()
     article['item_title'] = article.apply(
         get_clean_text, index='item_title', axis=1)
     article['authors'] = article.apply(get_clean_text, index='authors', axis=1)
     article['Tokenized'] = article['item_title'].apply(word_tokenize)
     article['Tagged'] = article['Tokenized'].apply(pos_tag)
     # Keep nouns (NN*) and adjectives (JJ*) that are not stop words. The
     # earlier ``NN or JJ and not-stop-word`` parsed as ``NN or (JJ and ...)``,
     # so nouns skipped the stop-word filter.
     article['Tags'] = article['Tagged'].apply(
         lambda tagged: [word for word, tag in tagged
                         if tag.startswith(('NN', 'JJ'))
                         and word.lower() not in stop_words])
     article['Tags'] = article.apply(get_paragraph, index='Tags', axis=1)
     article['Tags'] = article.apply(
         lambda x: x['Tags'] + ' ' + x['authors'] + ' ' + str(x['publication_year']), axis=1)
     article = article.drop(['keywords', 'publication_title',
                            'Tokenized', 'Tagged', 'authors', 'publication_year'], axis=1)
     # Round-trip the index through a column; with an unnamed index this
     # leaves the same labels in place under the name 'index'.
     article.reset_index(inplace=True)
     article.set_index('index', inplace=True)
     return article
 def get_vectorizer(row):
     """Return a fresh, unfitted ``TfidfVectorizer``; ``row`` is not read.

     The ``row`` parameter exists so the function can be passed to
     ``DataFrame.apply(..., axis=1)``.
     """
     return TfidfVectorizer(decode_error='ignore', strip_accents='ascii')
 def get_tfidf_matrix(row):
     """Fit the row's vectorizer on its articles' ``'Tags'`` and return the matrix.

     Reads ``row['article_vectorizer']`` and ``row['article_df']['Tags']``; the
     vectorizer is fitted in place by ``fit_transform``.
     """
     row_vectorizer = row['article_vectorizer']
     tag_texts = row['article_df']['Tags']
     return row_vectorizer.fit_transform(tag_texts)
 def article_preprocessing(df):
     """Attach per-journal article frames, vectorizers and TF-IDF matrices.

     Adds the columns ``article_df``, ``article_vectorizer`` and
     ``article_matrix`` to ``df`` in place and memoises the frame in ``CACHE``
     under ``JOURNAL_COMPLETE``; a cached frame is returned without any work.
     """
     if JOURNAL_COMPLETE in CACHE:
         return CACHE[JOURNAL_COMPLETE]
     # Order matters: the matrix builder reads the two columns added before it.
     column_builders = (
         ('article_df', get_article_df),
         ('article_vectorizer', get_vectorizer),
         ('article_matrix', get_tfidf_matrix),
     )
     for column, builder in column_builders:
         df[column] = df.apply(builder, axis=1)
     CACHE[JOURNAL_COMPLETE] = df
     return df
+journal_main = article_preprocessing(journal_main)
+print('done')
+# prediction
 journal_threshold = 4
 def get_journal_index(user_input):
     """Rank journals by cosine similarity to ``user_input``.

     Returns at most ``journal_threshold`` row positions of
     ``journal_tfidf_matrix``, best match first, keeping only journals whose
     similarity is strictly positive.
     """
     query_vector = vectorizer.transform([user_input])
     scores = cosine_similarity(query_vector, journal_tfidf_matrix).flatten()
     ranked = scores.argsort()[::-1]
     matching = [position for position in ranked if scores[position] > 0]
     # Slicing clamps to the list length on its own, so no explicit min() is
     # needed to stay within bounds.
     return matching[:journal_threshold]
 article_threshold = 10
     recommended_journals = get_journal_index(user_input)
     recommendations = []
     for journal_id in recommended_journals:
+        user_tfidf = journal_main['article_vectorizer'][journal_id].transform([
+                                                                              user_input])
+        cosine_similarities = cosine_similarity(
+            user_tfidf, journal_main['article_matrix'][journal_id]).flatten()
         indices = cosine_similarities.argsort()[::-1]
         top_recommendation_articles = [(cosine_similarities[i], i, journal_id) for i in indices if
                                        cosine_similarities[i] > 0][:min(article_threshold, len(indices))]
     return links
 # Expose get_links through a Gradio interface: one text input, JSON output.
 gradio_interface = gradio.Interface(
     fn=get_links,
     inputs="text",
     # "json" is the string shortcut for the JSON output component; the
     # gradio.outputs namespace used before is deprecated and absent from
     # later Gradio releases.
     outputs="json",
     examples=[
         ["AI"],
         ["Biochemicals"],
         ["Rocket Science"]
     ],
     title="Sprinkler Article Generator API",
     description="This is a AI powered REST API with Gradio and Huggingface Spaces – for free! Based on [this article](https://www.tomsoderlund.com/ai/building-ai-powered-rest-api). See the **Use via API** link at the bottom of this page.",
     article="© ScholarSync 2023"
 )
 gradio_interface.launch(share=True)