Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pandas as pd | |
| import warnings | |
| import openpyxl | |
| warnings.filterwarnings('ignore') | |
| import requests | |
| import json | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| import time | |
| from bs4 import BeautifulSoup as bs | |
| from googletrans import Translator | |
| import re | |
| import nltk | |
| from nltk.corpus import stopwords | |
| from nltk.tokenize import word_tokenize | |
| from nltk.corpus import wordnet | |
| from nltk.stem import WordNetLemmatizer | |
| nltk.download('punkt') | |
| nltk.download('averaged_perceptron_tagger') | |
| nltk.download('wordnet') | |
| nltk.download('stopwords') | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| import joblib | |
def load_data():
    """Read ``final_data.xlsx`` into a DataFrame, using its first column as the index."""
    song_table = pd.read_excel('final_data.xlsx', index_col=0)
    return song_table


# Song table (url / name / Artist / key_word plus feature columns) used by the
# similarity computations further down.
data = load_data()
def load_model():
    """Deserialize the estimator stored in ``SVM.pkl`` with joblib."""
    classifier = joblib.load('SVM.pkl')
    return classifier


# Estimator whose ``predict_proba`` output is later used as the book's
# emotion profile.
model = load_model()
############################
# Step 1: ask for a book title and halt this script run until one is given.
st.header('1οΈβ£ μ± μ λͺ©μ μ λ ₯ν΄μ£ΌμΈμ.')
book_title = st.text_input(
    label='μμ) λ μ¨κ° μ’μΌλ©΄ μ°Ύμκ°κ² μ΄μ',
    value="",
    key='text',
)


def reset():
    """Button callback: blank the text input by clearing its session-state entry."""
    st.session_state.text = ""


# NOTE: this rebinds ``reset`` from the callback above to the button's return
# value; the callback itself has already been handed to ``on_click``.
reset = st.button('Reset', on_click=reset)

if not book_title:
    # Nothing typed yet: show a hint in the result area and end this run.
    con = st.container()
    con.caption('Result')
    con.error('μ± μ λͺ©μ μ λ ₯ν΄μ£ΌμΈμ.', icon="β οΈ")
    st.stop()
# Look up the book through the Kakao book-search API, then scrape its detail
# page for descriptive text and the cover image.
import os

# SECURITY NOTE(review): this REST API key is committed in source. Supply it
# through the KAKAO_REST_API_KEY environment variable instead, then rotate the
# key and remove the literal fallback.
rest_api_key = os.environ.get("KAKAO_REST_API_KEY", "41d651c93152d5ec054dc828cacfa671")
url = "https://dapi.kakao.com/v3/search/book"
header = {"authorization": "KakaoAK " + rest_api_key}
querynum = {"query": book_title}
# Upper bound in seconds so a stalled HTTP call cannot hang the script run.
request_timeout = 10

try:
    response = requests.get(url, headers=header, params=querynum, timeout=request_timeout)
    response.raise_for_status()
    # Only the first search hit is used.
    book_info = json.loads(response.text)['documents'][0]
except (requests.RequestException, ValueError, KeyError, IndexError):
    # Network/HTTP failure, non-JSON body, missing 'documents' key, or no hits.
    con = st.container()
    con.caption('Result')
    con.error('μ‘΄μ¬νμ§ μλ μ± μ λλ€. λ€μ μ λ ₯ν΄μ£ΌμΈμ.', icon="π¨")
    st.stop()

# 'authors' arrives as a list. Passing the list straight to the DataFrame
# constructor yields one row per author (or zero rows for an empty list), which
# breaks the single-row assumptions of the code that follows, so it is joined
# into one string and the frame is pinned to exactly one row.
authors = book_info['authors']
if isinstance(authors, (list, tuple)):
    authors = ', '.join(authors)
book = pd.DataFrame({'title': book_info['title'],
                     'isbn': book_info['isbn'],
                     'authors': authors,
                     'publisher': book_info['publisher']},
                    index=[0])

target_url = book_info['url']
response = requests.get(target_url, timeout=request_timeout)
soup = bs(response.text, "html.parser")


def _first_text(selector):
    """Return the text of the first node matching *selector*, or '' when none match."""
    nodes = soup.select(selector)
    return nodes[0].text if nodes else ''


# Three descriptive sections of the detail page (presumably introduction,
# excerpt and review - confirm against the page layout). A missing section
# becomes '' because the translation step already special-cases empty text.
book['μ± μκ°'] = _first_text('#tabContent > div:nth-child(1) > div:nth-child(3) > p')
book['μ± μμΌλ‘'] = _first_text('#tabContent > div:nth-child(1) > div:nth-child(6) > p')
book['μν'] = _first_text('#tabContent > div:nth-child(1) > div:nth-child(7) > p')

# Cover thumbnail URL (still raises IndexError when the page has no thumbnail).
img = soup.select('#tabContent > div:nth-child(1) > div.info_section.info_intro > div.wrap_thumb > span > img')
img_src = img[0]['src']
# Two-column summary: cover thumbnail (narrow) next to the bibliographic
# captions (wide).
cover_col, info_col = st.columns([1, 2])

with cover_col:
    st.image(img_src, width=150)

with info_col:
    # All three values come from the row labelled 0 of the book frame.
    title = book.loc[0, 'title']
    author = book.loc[0, 'authors']
    publisher = book.loc[0, 'publisher']
    st.caption('μ λͺ© : ' + title)
    st.caption('μ μ : ' + author)
    st.caption('μΆμ°μ¬ : ' + publisher)

# Empty title, apparently used as a vertical spacer.
# NOTE(review): SOURCE lost its indentation; this is assumed to sit outside
# the column - confirm.
st.title('')
# Progress bar shown while the book's information is being processed.
text = '<' + title + '>μ λν μ 보λ₯Ό λͺ¨μΌκ³ μλ μ€μ λλ€.'
my_bar = st.progress(0, text=text)

# (pause in seconds, percent to display, label) for the first two steps.
for pause_seconds, percent, label in ((5, 5, 'γ°οΈ5%γ°οΈ'), (1, 30, 'γ°οΈ30%γ°οΈ')):
    time.sleep(pause_seconds)
    my_bar.progress(percent, text=label)

# English stop-word set consulted by delete_stops().
stops = set(stopwords.words('english'))
def hapus_url(text):
    """Strip ``@mention`` handles and ``http...`` links from *text*.

    Mentions (``@`` followed by word characters) are removed first, then every
    run of non-whitespace characters that starts with ``http``. Surrounding
    whitespace is left untouched.
    """
    mention_re = re.compile(r'@[\w]+')
    link_re = re.compile(r'http\S+')
    without_mentions = mention_re.sub('', text)
    return link_re.sub('', without_mentions)
# Special-character removal: drops every character that is not an English
# letter, a digit, or whitespace (space, tab, newline, ...).
def remove_special_characters(text, remove_digits=True):
    """Return *text* without characters outside ``a-z``, ``A-Z``, ``0-9`` and whitespace.

    ``remove_digits`` is accepted but never consulted: digits are kept
    regardless of its value.
    """
    disallowed = re.compile(r'[^a-zA-Z0-9\s]')
    return disallowed.sub('', text)
# Stop-word removal.
def delete_stops(text):
    """Lower-case *text*, split it on whitespace and drop words found in ``stops``.

    The surviving words are returned joined by single spaces.
    """
    kept_words = [token for token in text.lower().split() if token not in stops]
    return ' '.join(kept_words)
# Helper that maps a part-of-speech tag to a WordNet POS constant.
def get_wordnet_pos(treebank_tag):
    """Translate a POS tag into the WordNet POS constant chosen by its first letter.

    Tags starting with ``J``, ``V``, ``N`` and ``R`` map to adjective, verb,
    noun and adverb respectively; any other tag falls back to noun.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    return wordnet.NOUN
# POS tagging + lemmatization.
def tockenize(text):
    """Tokenize *text*, POS-tag the tokens and return their lemmas joined by spaces.

    Each token is lemmatized with the WordNet POS that ``get_wordnet_pos``
    derives from its tag.
    """
    tagged_tokens = nltk.pos_tag(word_tokenize(text))
    lemmatizer = WordNetLemmatizer()
    lemmas = [
        lemmatizer.lemmatize(token, get_wordnet_pos(tag))
        for token, tag in tagged_tokens
    ]
    return ' '.join(lemmas)
def clean(text):
    """Run the text-cleaning pipeline and return the result.

    Steps, in order: special-character removal, stop-word removal (which also
    lower-cases), then POS-aware lemmatization.
    """
    without_symbols = remove_special_characters(text, remove_digits=True)
    without_stops = delete_stops(without_symbols)
    return tockenize(without_stops)
# Translate each scraped section with googletrans (default target language),
# clean it, and store it in a '<column>_trans' column.
translator = Translator()
for col in ['μ± μκ°', 'μ± μμΌλ‘', 'μν']:
    name = col + '_trans'
    # Read the scalar from row 0 instead of comparing the whole column array:
    # an array comparison has no single truth value when the frame holds more
    # than one row.
    source_text = book.loc[0, col]
    if not source_text.strip():
        # Nothing to translate for this section.
        book[name] = ''
        continue
    book[name] = clean(translator.translate(hapus_url(source_text)).text)
# NOTE: delete_stops() reads the global ``stops`` and cannot be called after this.
del stops
del translator

# Join the translated sections with a space so the last word of one section is
# not fused with the first word of the next before TF-IDF tokenization.
translated_parts = [
    book.loc[0, 'μ± μκ°_trans'],
    book.loc[0, 'μ± μμΌλ‘_trans'],
    book.loc[0, 'μν_trans'],
]
total_text = ' '.join(part for part in translated_parts if part)
# Untranslated text, shown later as a short excerpt.
long = book.loc[0, 'μ± μκ°'] + book.loc[0, 'μ± μμΌλ‘'] + book.loc[0, 'μν']
del book
def load_tweet():
    """Read ``tweet_data_agumentation.csv`` into a DataFrame, first column as index."""
    tweets = pd.read_csv('tweet_data_agumentation.csv', index_col=0)
    return tweets


# Tweet corpus; its "content" column is used to fit the TF-IDF vocabulary.
df = load_tweet()
# Vectorize the book text with a TF-IDF vocabulary fitted on the tweet corpus,
# then ask the classifier for per-emotion probabilities.
tfidf_vect_emo = TfidfVectorizer()
# Only the fitted state is needed; the transformed corpus was never used.
tfidf_vect_emo.fit(df["content"])
del df
total_text2 = tfidf_vect_emo.transform(pd.Series(total_text))

# Single predict_proba call (the earlier duplicate call discarded its result).
sentiment = pd.DataFrame(model.predict_proba(total_text2), index=['prob']).T
# NOTE(review): assumes the predict_proba columns follow exactly this label
# order - verify against model.classes_.
sentiment['κ°μ '] = ['empty', 'sadness', 'enthusiasm', 'worry', 'love', 'fun',
                      'hate', 'happiness', 'boredom', 'relief', 'anger']
del tfidf_vect_emo
del model
my_bar.progress(60, text='γ°οΈ60%γ°οΈ')
# Audio features vs. the book's text emotions.
audio_data = data.iloc[:, -12:-1]
sentiment_prob = sentiment['prob']
sentiment_prob.index = sentiment['κ°μ ']
# Rename the audio columns to the emotion labels so they align with
# sentiment_prob when concatenated.
audio_data.columns = ['empty', 'sadness', 'enthusiasm', 'worry', 'love', 'fun', 'hate',
                      'happiness', 'boredom', 'relief', 'anger']
# Row 0 is the book's emotion vector; the remaining rows are the songs.
audio_data_1 = pd.concat([sentiment_prob, audio_data.T], axis=1).T
col = ['book'] + list(data['name'])
# Only the book-vs-everything similarities are used, so compute that single
# row instead of the full (n+1) x (n+1) matrix. Entry 0 is the book itself.
audio_sim = pd.Series(
    cosine_similarity(audio_data_1.iloc[:1], audio_data_1)[0],
    index=col,
    name='book',
)
del audio_data
del audio_data_1
# Lyrics features vs. the book's text emotions.
lyrics_data = data.iloc[:, 5:-12]
# Row 0 is the book's emotion vector; the remaining rows are the songs.
# NOTE(review): the concat aligns on labels, so the lyrics columns are assumed
# to carry the same emotion names as sentiment_prob's index - confirm.
lyrics_data_1 = pd.concat([sentiment_prob, lyrics_data.T], axis=1).T
# Only the book-vs-everything similarities are used, so compute that single
# row instead of the full (n+1) x (n+1) matrix. Entry 0 is the book itself.
lyrics_sim = pd.Series(
    cosine_similarity(lyrics_data_1.iloc[:1], lyrics_data_1)[0],
    index=col,
    name='book',
)
del lyrics_data
del lyrics_data_1
del sentiment_prob
my_bar.progress(80, text='γ°οΈ80%γ°οΈ')
# Song keywords vs. the book's text.
keyword_data = data['key_word']
# Document 0 is the book text; the song keyword strings follow in data's row order.
keyword_data_1 = pd.concat([pd.Series([total_text]), keyword_data], ignore_index=True)
tfidf_vect_cont = TfidfVectorizer()
tfidf_matrix_cont = tfidf_vect_cont.fit_transform(keyword_data_1)
# Keep the TF-IDF matrix sparse and compare only the book row against all
# rows, instead of densifying it and building the full pairwise matrix.
# Entry 0 is the book itself.
keyword_sim = pd.Series(
    cosine_similarity(tfidf_matrix_cont[0], tfidf_matrix_cont)[0],
    index=col,
    name='book',
)
del total_text
del keyword_data
del keyword_data_1
del tfidf_vect_cont
del tfidf_matrix_cont
my_bar.progress(100, text='100%')
# Overall similarity: weighted blend of the three similarity series.
def _top_songs(songs, audio, lyrics_part, keyword, weights, n=5):
    """Return the *n* highest-scoring songs as a frame with url/name/Artist/book.

    *weights* is ``(audio, lyrics, keyword)``. Each similarity series holds the
    book itself at position 0 followed by one entry per row of *songs*, in row
    order. Scores are attached positionally rather than merged on ``name``, so
    two songs sharing a title cannot cross-multiply into extra rows, and
    duplicate song rows are dropped before the top *n* are taken.
    """
    w_audio, w_lyrics, w_keyword = weights
    total_sim = w_audio * audio + w_lyrics * lyrics_part + w_keyword * keyword
    ranked = songs[['url', 'name', 'Artist']].reset_index(drop=True)
    # Skip position 0 (the book's similarity to itself).
    ranked['book'] = total_sim.values[1:]
    ranked = ranked.sort_values(by='book', ascending=False)
    ranked = ranked.drop_duplicates(subset=['url', 'name', 'Artist'])
    return ranked.head(n).reset_index(drop=True)


# Mood-oriented ranking: audio : lyrics : keyword = 0.8 : 0.1 : 0.1
top_five_df = _top_songs(data, audio_sim, lyrics_sim, keyword_sim, (0.8, 0.1, 0.1))
# Content-oriented ranking: audio : lyrics : keyword = 0 : 0.5 : 0.5
top_five_df_1 = _top_songs(data, audio_sim, lyrics_sim, keyword_sim, (0, 0.5, 0.5))
del data
# Let the finished progress bar linger briefly, then remove it.
time.sleep(1)
my_bar.empty()

# Short excerpt (first 300 characters) of the scraped book text.
st.caption('μ± μκ° μ€....')
st.markdown(long[:300] + '...')
st.markdown('')

# Collect lyrics for the recommended songs: the first table's songs come
# first, followed by the second table's.
# NOTE(review): `lyrics` is not defined anywhere in the visible part of this
# file; it must be a DataFrame with 'url', 'lyrics' and 'lyrics_english'
# columns - confirm where it is loaded.
recommended_urls = list(top_five_df['url']) + list(top_five_df_1['url'])
lyrics_list = [
    lyrics[song_url == lyrics['url']]['lyrics'].values[0]
    for song_url in recommended_urls
]
lyrics_eng_list = [
    lyrics[song_url == lyrics['url']]['lyrics_english'].values[0]
    for song_url in recommended_urls
]
del lyrics
def _render_top_songs(songs, lyric_offset):
    """Render one 'TOP 1'..'TOP 5' tab group for the rows of *songs*.

    *songs* needs the columns name / Artist / url / book. The lyrics for row
    ``k`` are read from ``lyrics_list[lyric_offset + k]`` and
    ``lyrics_eng_list[lyric_offset + k]``. A tab without a matching row shows
    only its heading instead of raising an IndexError.
    """
    tab_labels = ['TOP 1', 'TOP 2', 'TOP 3', 'TOP 4', 'TOP 5']
    # Heading literals reproduced exactly as they appear in the file.
    headings = ['π₯ TOP 1', 'π₯ TOP 2', 'π₯ TOP 3', 'TOP 4', 'TOP 5']
    tabs = st.tabs(tab_labels)
    last_rank = len(tabs) - 1
    for rank, (tab, heading) in enumerate(zip(tabs, headings)):
        with tab:
            st.subheader(heading)
            if rank >= len(songs):
                continue
            song = songs.iloc[rank]
            st.markdown('**μ λͺ©** : {0}'.format(song['name']))
            st.markdown('**κ°μ** : {0} '.format(song['Artist']))
            st.markdown('**url** : {0} '.format(song['url']))
            st.markdown('**μ μ¬λ** : {0:.4f}'.format(song['book']))
            with st.expander('κ°μ¬'):
                st.caption('μλ³Έ ver')
                st.markdown(lyrics_list[lyric_offset + rank])
                st.caption('μμ΄ ver')
                st.markdown(lyrics_eng_list[lyric_offset + rank])
            # The original layout puts a blank spacer after every tab but the last.
            if rank != last_rank:
                st.markdown('')


st.header('2οΈβ£ κ²°κ³Ό')

# Ranking weighted audio : lyrics : keyword = 0.8 : 0.1 : 0.1
st.subheader('π λ Έλμ λΆμκΈ°κ° μ μ¬ν λ Έλ')
st.caption('AF : κ°μ¬ : ν€μλ = 0.8 : 0.1 : 0.1')
_render_top_songs(top_five_df, 0)

# Ranking weighted audio : lyrics : keyword = 0 : 0.5 : 0.5
st.subheader('π λ Έλμ λ΄μ©μ΄ μ μ¬ν λ Έλ')
st.caption('AF : κ°μ¬ : ν€μλ = 0 : 0.5 : 0.5')
# The lyrics lists hold the first table's songs first, so the second table's
# entries start right after them (5 when the first table is full).
_render_top_songs(top_five_df_1, len(top_five_df))