import os
import gradio
import pandas as pd
import psycopg2
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import unicodedata

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
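
# Pipeline overview: load the Springer data, build one TF-IDF index over
# journal-level "Tags" text, then one TF-IDF index per journal over its
# articles. A query is matched to journals first, then to articles inside
# the top journals, and the results are served through two Gradio APIs.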

def get_paragraph(row, index):
    ans = ''
    for x in row[index]:
        ans = ans + ' ' + x.lower()
    return ans
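
# Example with a hypothetical row:
#   get_paragraph({'Articles': ['Deep Learning', 'NLP']}, 'Articles')
#   -> ' deep learning nlp'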

def remove_accents(text):
    text = unicodedata.normalize('NFKD', text).encode(
        'ASCII', 'ignore').decode('utf-8')
    return text

def get_clean_text(row, index):
    if not isinstance(row[index], str):
        return ''
    if row[index] == "NULL":
        return ''
    clean_text = ''
    words = word_tokenize(row[index].lower())
    for word in words:
        word = word.replace(',', ' ')
        word = remove_accents(word)
        # Keep purely alphabetic tokens longer than one character that are
        # not stopwords; the regex already excludes any punctuation.
        if re.match(r'^[a-zA-Z]+$', word) and word not in stop_words and len(word) > 1:
            clean_text += ' ' + word
    return clean_text

def combine(row, indices):
    ans = ''
    for i in indices:
        ans = ans + ' ' + row[i]
    return ans
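
# The helpers above are meant for DataFrame.apply(..., axis=1), which passes
# each row along with extra keyword arguments such as `index` or `indices`.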

stop_words = set(stopwords.words('english'))
query = "SELECT * FROM base_springerdata"

CACHE = {}
SQL_KEY = 'sql'
JOURNAL_COMPLETE = 'journal_complete'
JOURNAL_PARTIAL = 'journal_partial'
VECTORIZER = 'vectorizer'
JOURNAL_TFIDF = 'journal_tfidf'

# Access the secrets
HOST = os.getenv('DATABASE_HOST')
DATABASE = os.getenv('DATABASE_NAME')
USER = os.getenv('DATABASE_USER')
PASSWORD = os.getenv('DATABASE_PASSWORD')

# load sql
def load_sql_data(query):
    if SQL_KEY in CACHE:
        return CACHE[SQL_KEY]
    try:
        conn = psycopg2.connect(
            host=HOST,
            database=DATABASE,
            user=USER,
            password=PASSWORD
        )
        df = pd.read_sql_query(query, conn)
        df = df.drop(['item_doi'], axis=1)
        # Close the database connection
        conn.close()
        CACHE[SQL_KEY] = df
        return df
    except psycopg2.Error:
        # If there is an error connecting to the database, load data from
        # the compressed CSV file instead
        df = pd.read_csv('compressed_data.bz2', compression='bz2')
        df = df.drop(['item_doi'], axis=1)
        CACHE[SQL_KEY] = df
        return df
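
# The loaded frame is expected to contain at least publication_title,
# item_title, authors, publication_year, url and keywords (inferred from
# the columns referenced below).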

# main_df
main_df = load_sql_data(query)

# load journal_df
def get_journal_df(df):
    if JOURNAL_PARTIAL in CACHE:
        return CACHE[JOURNAL_PARTIAL]
    journal_art = df.groupby('publication_title')['item_title'].apply(
        list).reset_index(name='Articles')
    journal_art.set_index(['publication_title'], inplace=True)
    journal_auth = df.groupby('publication_title')['authors'].apply(
        list).reset_index(name='authors')
    journal_auth.set_index('publication_title', inplace=True)
    journal_key = df.drop_duplicates(
        subset=["publication_title", "keywords"], keep='first')
    journal_key = journal_key.drop(
        ['item_title', 'authors', 'publication_year', 'url'], axis=1)
    journal_key.set_index(['publication_title'], inplace=True)
    journal_main = journal_art.join([journal_key, journal_auth])
    print('journal_main initial')
    journal_main.reset_index(inplace=True)
    journal_main['Articles'] = journal_main.apply(
        get_paragraph, index='Articles', axis=1)
    journal_main['Articles'] = journal_main.apply(
        get_clean_text, index='Articles', axis=1)
    journal_main['authors'] = journal_main.apply(
        get_paragraph, index='authors', axis=1)
    journal_main['authors'] = journal_main.apply(
        get_clean_text, index='authors', axis=1)
    journal_main['keywords'] = journal_main.apply(
        get_clean_text, index='keywords', axis=1)
    journal_main['Tags'] = journal_main.apply(
        combine, indices=['keywords', 'Articles', 'authors'], axis=1)
    journal_main['Tags'] = journal_main.apply(
        get_clean_text, index='Tags', axis=1)
    CACHE[JOURNAL_PARTIAL] = journal_main
    return journal_main
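
# journal_main has one row per journal; article titles, authors and keywords
# are flattened to cleaned text and merged into a single Tags column.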

# Journal Dataframe
journal_main = get_journal_df(main_df)
print('journal_main processed')

# load tfidfs
def get_tfidfs(journal_main):
    # A cache hit requires both the vectorizer and the matrix to be present.
    if VECTORIZER in CACHE and JOURNAL_TFIDF in CACHE:
        return CACHE[VECTORIZER], CACHE[JOURNAL_TFIDF]
    vectorizer = TfidfVectorizer(decode_error='ignore', strip_accents='ascii')
    journal_tfidf_matrix = vectorizer.fit_transform(journal_main['Tags'])
    CACHE[VECTORIZER] = vectorizer
    CACHE[JOURNAL_TFIDF] = journal_tfidf_matrix
    return vectorizer, journal_tfidf_matrix

vectorizer, journal_tfidf_matrix = get_tfidfs(journal_main)
print('tfidfs and vectorizer for journals completed')
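
# journal_tfidf_matrix holds one row per journal and one column per term in
# the vocabulary learned from the Tags text.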

def get_article_df(row):
    article = main_df.loc[main_df['publication_title'] ==
                          journal_main['publication_title'][row.name]].copy()
    article['item_title'] = article.apply(
        get_clean_text, index='item_title', axis=1)
    article['authors'] = article.apply(get_clean_text, index='authors', axis=1)
    article['Tokenized'] = article['item_title'].apply(word_tokenize)
    article['Tagged'] = article['Tokenized'].apply(pos_tag)
    # Keep nouns and adjectives that are not stopwords.
    article['Tags'] = article['Tagged'].apply(
        lambda x: [word for word, tag in x if
                   (tag.startswith('NN') or tag.startswith('JJ'))
                   and word.lower() not in stop_words])
    article['Tags'] = article.apply(get_paragraph, index='Tags', axis=1)
    article['Tags'] = article.apply(
        lambda x: x['Tags'] + ' ' + x['authors'] + ' ' + str(x['publication_year']), axis=1)
    article = article.drop(['keywords', 'publication_title',
                            'Tokenized', 'Tagged', 'authors', 'publication_year'], axis=1)
    article.reset_index(inplace=True)
    article.set_index('index', inplace=True)
    return article
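
# Each per-journal article frame keeps the cleaned item_title, the url and a
# derived Tags column (title nouns/adjectives plus authors and year).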

def get_vectorizer(row):
    vectorizer = TfidfVectorizer(decode_error='ignore', strip_accents='ascii')
    return vectorizer

def get_tfidf_matrix(row):
    tfidf_matrix = row['article_vectorizer'].fit_transform(
        row['article_df']['Tags'])
    return tfidf_matrix

def article_preprocessing(df):
    if JOURNAL_COMPLETE in CACHE:
        return CACHE[JOURNAL_COMPLETE]
    df['article_df'] = df.apply(get_article_df, axis=1)
    df['article_vectorizer'] = df.apply(get_vectorizer, axis=1)
    df['article_matrix'] = df.apply(get_tfidf_matrix, axis=1)
    CACHE[JOURNAL_COMPLETE] = df
    return df

journal_main = article_preprocessing(journal_main)
print('done')

# prediction
journal_threshold = 4

def get_journal_index(user_input):
    user_tfidf = vectorizer.transform([user_input])
    cosine_similarities = cosine_similarity(
        user_tfidf, journal_tfidf_matrix).flatten()
    indices = cosine_similarities.argsort()[::-1]
    top_recommendations = [i for i in indices if cosine_similarities[i] > 0][:min(
        journal_threshold, len(indices))]
    return top_recommendations
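
# get_journal_index returns the row positions of up to journal_threshold
# journals with non-zero cosine similarity to the query, best match first.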

article_threshold = 10

def get_article_recommendations(user_input):
    recommended_journals = get_journal_index(user_input)
    recommendations = []
    for journal_id in recommended_journals:
        user_tfidf = journal_main['article_vectorizer'][journal_id].transform(
            [user_input])
        cosine_similarities = cosine_similarity(
            user_tfidf, journal_main['article_matrix'][journal_id]).flatten()
        indices = cosine_similarities.argsort()[::-1]
        top_recommendation_articles = [
            (cosine_similarities[i], i, journal_id) for i in indices
            if cosine_similarities[i] > 0][:min(article_threshold, len(indices))]
        recommendations += top_recommendation_articles
    recommendations.sort(reverse=True)
    return recommendations
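
# Recommendations are (similarity, article_position, journal_position)
# tuples; Python compares tuples element-wise, so sort(reverse=True) ranks
# them by similarity first.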

def validation(text):
    words = word_tokenize(text)
    # Perform part-of-speech tagging
    tagged_words = pos_tag(words)
    # Check if any adjective or noun is present
    adjectives = [word for word, pos in tagged_words if pos.startswith('JJ')]
    nouns = [word for word, pos in tagged_words if pos.startswith('NN')]
    result = {}
    if not adjectives and not nouns:
        result['validation'] = 'invalid'
    else:
        adjective_str = ' '.join(adjectives)
        noun_str = ' '.join(nouns)
        combined_sentence = f"{adjective_str} {noun_str}".strip()
        result['validation'] = 'valid'
        result['sentence'] = combined_sentence
    return result
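
# Example (POS tags come from NLTK and can vary between model versions):
#   validation("machine learning")  may return
#   {'validation': 'valid', 'sentence': 'machine learning'}
#   validation("of the")  returns {'validation': 'invalid'}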

def get_links(user_input):
    check = validation(user_input)
    if check['validation'] == 'valid':
        recommendations = get_article_recommendations(check['sentence'])
        links = []
        for article in recommendations:
            # Name the unpacked similarity `score` so it does not shadow
            # sklearn's cosine_similarity import.
            score, article_id, journal_id = article
            link = {
                # After the drops in get_article_df, column 0 is expected to
                # hold the article title and column 1 its URL.
                "title": journal_main['article_df'][journal_id].iloc[article_id, 0],
                "url": journal_main['article_df'][journal_id].iloc[article_id, 1],
                "article_id": int(article_id),
                "journal_id": int(journal_id)
            }
            links.append(link)
        return links
    else:
        return []
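
# Example: get_links("machine learning") returns a list of dicts shaped like
# {"title": ..., "url": ..., "article_id": ..., "journal_id": ...}, or an
# empty list when the input contains no nouns or adjectives.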

validation_interface = gradio.Interface(
    fn=validation,
    inputs="text",
    outputs=gradio.JSON(),
    title="Validation API - Testing API of ScholarSync",
    description="API to validate user input"
)

links_interface = gradio.Interface(
    fn=get_links,
    inputs="text",
    outputs=gradio.JSON(),
    examples=[
        ["AI"],
        ["Biochemicals"],
        ["Rocket Science"]
    ],
    title="Article Links Generator API - Testing API of ScholarSync",
    description="API to generate article recommendations based on user input"
)

# Combine interfaces into a single app
app = gradio.TabbedInterface([links_interface, validation_interface],
                             ["articles link generation", "validation"])

# Run the app
if __name__ == "__main__":
    app.launch()