import streamlit as st import pandas as pd import warnings import openpyxl warnings.filterwarnings('ignore') import requests import json from sklearn.metrics.pairwise import cosine_similarity import time from bs4 import BeautifulSoup as bs from googletrans import Translator import re import nltk from nltk.corpus import stopwords from nltk.tokenize import word_tokenize from nltk.corpus import wordnet from nltk.stem import WordNetLemmatizer nltk.download('punkt') nltk.download('averaged_perceptron_tagger') nltk.download('wordnet') nltk.download('stopwords') from sklearn.feature_extraction.text import TfidfVectorizer import joblib @st.cache_data def load_data(): return pd.read_excel('final_data.xlsx',index_col=0) data = load_data() @st.cache_data def load_model(): return joblib.load('SVM.pkl') model = load_model() ############################ st.header('1️⃣ 책 제목을 입력해주세요.') book_title = st.text_input(label = '예시) 날씨가 좋으면 찾아가겠어요',value="",key='text') def reset(): st.session_state.text = "" reset = st.button('Reset',on_click=reset) if not book_title: con = st.container() con.caption('Result') con.error('책 제목을 입력해주세요.',icon="⚠️") st.stop() rest_api_key = "41d651c93152d5ec054dc828cacfa671" url = "https://dapi.kakao.com/v3/search/book" header = {"authorization": "KakaoAK "+rest_api_key} querynum = {"query": book_title} try: response = requests.get(url, headers=header, params = querynum) content = response.text 책정보 = json.loads(content)['documents'][0] except: con = st.container() con.caption('Result') con.error('존재하지 않는 책입니다. 다시 입력해주세요.',icon="🚨") st.stop() book = pd.DataFrame({'title': 책정보['title'], 'isbn': 책정보['isbn'], 'authors': 책정보['authors'], 'publisher': 책정보['publisher']}) target_url = 책정보['url'] response = requests.get(target_url) soup = bs(response.text, "html.parser") 책소개 = soup.select('#tabContent > div:nth-child(1) > div:nth-child(3) > p') 책속으로 = soup.select('#tabContent > div:nth-child(1) > div:nth-child(6) > p') 서평 = soup.select('#tabContent > div:nth-child(1) > div:nth-child(7) > p') 책소개 = 책소개[0].text 책속으로 = 책속으로[0].text 서평 = 서평[0].text book['책소개'] = 책소개 book['책속으로'] = 책속으로 book['서평'] = 서평 img= soup.select('#tabContent > div:nth-child(1) > div.info_section.info_intro > div.wrap_thumb > span > img') img_src = img[0]['src'] col1, col2 = st.columns([1,2]) with col1: st.image(img_src,width=150) with col2: title = book['title'][0] author = book['authors'][0] publisher = book['publisher'][0] st.caption('제목 : '+ title) st.caption('저자 : '+ author) st.caption('출산사 : '+publisher) st.title('') text = '<'+title +'>에 대한 정보를 모으고 있는 중입니다.' my_bar = st.progress(0, text=text) time.sleep(5) my_bar.progress(5, text='〰️5%〰️') time.sleep(1) my_bar.progress(30, text='〰️30%〰️') #영어 불용어 사전 stops = set(stopwords.words('english')) def hapus_url(text): mention_pattern = r'@[\w]+' cleaned_text = re.sub(mention_pattern, '', text) return re.sub(r'http\S+','', cleaned_text) #특수문자 제거 #영어 대소문자, 숫자, 공백문자(스페이스, 탭, 줄바꿈 등) 아닌 문자들 제거 def remove_special_characters(text, remove_digits=True): text=re.sub(r'[^a-zA-Z0-9\s]', '', text) return text #불용어 제거 def delete_stops(text): text = text.lower().split() text = ' '.join([word for word in text if word not in stops]) return text #품사 tag 매칭용 함수 def get_wordnet_pos(treebank_tag): if treebank_tag.startswith('J'): return wordnet.ADJ elif treebank_tag.startswith('V'): return wordnet.VERB elif treebank_tag.startswith('N'): return wordnet.NOUN elif treebank_tag.startswith('R'): return wordnet.ADV else: return wordnet.NOUN #품사 태깅 + 표제어 추출 def tockenize(text): tokens=word_tokenize(text) pos_tokens=nltk.pos_tag(tokens) del tokens text_t=list() for _ in pos_tokens: text_t.append([_[0], get_wordnet_pos(_[1])]) del pos_tokens lemmatizer = WordNetLemmatizer() text = ' '.join([lemmatizer.lemmatize(word[0], word[1]) for word in text_t]) del lemmatizer return text def clean(text): text = remove_special_characters(text, remove_digits=True) text = delete_stops(text) text = tockenize(text) return text translator = Translator() for col in ['책소개', '책속으로', '서평']: name = col+'_trans' if book[col].values == '': book[name] = '' continue book[name] = clean(translator.translate(hapus_url(book.loc[0, col])).text) del stops del translator total_text = book.loc[0, '책소개_trans'] + book.loc[0, '책속으로_trans'] + book.loc[0, '서평_trans'] long = book.loc[0, '책소개'] + book.loc[0, '책속으로'] + book.loc[0, '서평'] del book @st.cache_data def load_tweet(): return pd.read_csv('tweet_data_agumentation.csv', index_col = 0) df = load_tweet() tfidf_vect_emo = TfidfVectorizer() tfidf_vect_emo.fit_transform(df["content"]) del df total_text2 = tfidf_vect_emo.transform(pd.Series(total_text)) model.predict_proba(total_text2) sentiment = pd.DataFrame(model.predict_proba(total_text2),index=['prob']).T sentiment['감정'] = ['empty','sadness','enthusiasm','worry','love','fun','hate','happiness','boredom','relief','anger'] del tfidf_vect_emo del model my_bar.progress(60, text='〰️60%〰️') # audio feature랑 text 감정 audio_data = data.iloc[:,-12:-1] sentiment_prob = sentiment['prob'] sentiment_prob.index = sentiment['감정'] audio_data.columns = ['empty', 'sadness', 'enthusiasm', 'worry', 'love', 'fun', 'hate', 'happiness', 'boredom', 'relief', 'anger'] audio_data_1 = pd.concat([sentiment_prob,audio_data.T],axis=1).T col = ['book']+list(data['name']) cosine_sim_audio = cosine_similarity(audio_data_1) cosine_sim_audio_df = pd.DataFrame(cosine_sim_audio, index = col, columns=col) audio_sim = cosine_sim_audio_df['book'] del audio_data del cosine_sim_audio del cosine_sim_audio_df # 가사랑 text lyrics_data = data.iloc[:,5:-12] lyrics_data_1 = pd.concat([sentiment_prob,lyrics_data.T],axis=1).T cosine_sim_lyrics = cosine_similarity(lyrics_data_1) cosine_sim_lyrics_df = pd.DataFrame(cosine_sim_lyrics, index =col, columns=col) lyrics_sim = cosine_sim_lyrics_df['book'] del lyrics_data del lyrics_data_1 del cosine_sim_lyrics del cosine_sim_lyrics_df del sentiment_prob my_bar.progress(80, text='〰️80%〰️') # 키워드랑 text keyword_data = data['key_word'] book_song_cont1 = pd.DataFrame({"text": total_text}, index = range(1)) book_song_cont2 = pd.DataFrame({"text": keyword_data}) keyword_data_1 = pd.concat([book_song_cont1, book_song_cont2], axis=0).reset_index(drop=True) tfidf_vect_cont = TfidfVectorizer() tfidf_matrix_cont = tfidf_vect_cont.fit_transform(keyword_data_1['text']) tfidf_array_cont = tfidf_matrix_cont.toarray() cosine_sim_keyword = cosine_similarity(tfidf_array_cont) cosine_sim_keyword_df = pd.DataFrame(cosine_sim_keyword, index = col, columns=col) keyword_sim = cosine_sim_keyword_df['book'] del total_text del keyword_data del book_song_cont1 del book_song_cont2 del keyword_data_1 del tfidf_vect_cont del tfidf_matrix_cont del tfidf_array_cont del cosine_sim_keyword del cosine_sim_keyword_df my_bar.progress(100, text='100%') # 전체 유사도 계산 total_sim = 0.8*audio_sim + 0.1*lyrics_sim + 0.1*keyword_sim total_sim_df = pd.DataFrame(total_sim[1:]) total_sim_df = total_sim_df.reset_index() total_sim_df.columns = ['name','book'] top_five = total_sim_df.sort_values(by='book',ascending=False)[:5] index = total_sim_df.sort_values(by='book',ascending=False)[:5].index.sort_values() del total_sim del total_sim_df artist = data.iloc[index][['url','name','Artist']] top_five_df = pd.merge(artist,top_five,on='name').sort_values(by='book',ascending=False).drop_duplicates() del artist del top_five total_sim = 0*audio_sim + 0.5*lyrics_sim + 0.5*keyword_sim total_sim_df_1 = pd.DataFrame(total_sim[1:]) total_sim_df_1 = total_sim_df_1.reset_index() total_sim_df_1.columns = ['name','book'] top_five_1 = total_sim_df_1.sort_values(by='book',ascending=False)[:5] index_1 = total_sim_df_1.sort_values(by='book',ascending=False)[:5].index.sort_values() del total_sim del total_sim_df_1 artist = data.iloc[index_1][['url','name','Artist']] top_five_df_1 = pd.merge(artist,top_five_1,on='name').sort_values(by='book',ascending=False).drop_duplicates() del artist del top_five_1 del data time.sleep(1) my_bar.empty() st.caption('책 소개 중....') st.markdown(long[:300]+'...') st.markdown('') lyrics_list = [] for i in top_five_df['url']: lyrics_list.append(lyrics[i== lyrics['url']]['lyrics'].values[0]) for i in top_five_df_1['url']: lyrics_list.append(lyrics[i== lyrics['url']]['lyrics'].values[0]) lyrics_eng_list = [] for i in top_five_df['url']: lyrics_eng_list.append(lyrics[i== lyrics['url']]['lyrics_english'].values[0]) for i in top_five_df_1['url']: lyrics_eng_list.append(lyrics[i== lyrics['url']]['lyrics_english'].values[0]) del lyrics st.header('2️⃣ 결과') st.subheader('🙂 노래와 분위기가 유사한 노래') st.caption('AF : 가사 : 키워드 = 0.8 : 0.1 : 0.1') tab1, tab2, tab3, tab4, tab5= st.tabs(['TOP 1' , 'TOP 2', 'TOP 3', 'TOP 4', 'TOP 5']) with tab1: st.subheader('🥇 TOP 1') st.markdown('**제목** : {0}'.format(top_five_df.iloc[0]['name'])) st.markdown('**가수** : {0} '.format(top_five_df.iloc[0]['Artist'])) st.markdown('**url** : {0} '.format(top_five_df.iloc[0]['url'])) st.markdown('**유사도** : {0:.4f}'.format(top_five_df.iloc[0]['book'])) with st.expander('가사'): st.caption('원본 ver') st.markdown(lyrics_list[0]) st.caption('영어 ver') st.markdown(lyrics_eng_list[0]) st.markdown('') with tab2: st.subheader('🥈 TOP 2') st.markdown('**제목** : {0}'.format(top_five_df.iloc[1]['name'])) st.markdown('**가수** : {0} '.format(top_five_df.iloc[1]['Artist'])) st.markdown('**url** : {0} '.format(top_five_df.iloc[1]['url'])) st.markdown('**유사도** : {0:.4f}'.format(top_five_df.iloc[1]['book'])) with st.expander('가사'): st.caption('원본 ver') st.markdown(lyrics_list[1]) st.caption('영어 ver') st.markdown(lyrics_eng_list[1]) st.markdown('') with tab3: st.subheader('🥉 TOP 3') st.markdown('**제목** : {0}'.format(top_five_df.iloc[2]['name'])) st.markdown('**가수** : {0} '.format(top_five_df.iloc[2]['Artist'])) st.markdown('**url** : {0} '.format(top_five_df.iloc[2]['url'])) st.markdown('**유사도** : {0:.4f}'.format(top_five_df.iloc[2]['book'])) with st.expander('가사'): st.caption('원본 ver') st.markdown(lyrics_list[2]) st.caption('영어 ver') st.markdown(lyrics_eng_list[2]) st.markdown('') with tab4: st.subheader('TOP 4') st.markdown('**제목** : {0}'.format(top_five_df.iloc[3]['name'])) st.markdown('**가수** : {0} '.format(top_five_df.iloc[3]['Artist'])) st.markdown('**url** : {0} '.format(top_five_df.iloc[3]['url'])) st.markdown('**유사도** : {0:.4f}'.format(top_five_df.iloc[3]['book'])) with st.expander('가사'): st.caption('원본 ver') st.markdown(lyrics_list[3]) st.caption('영어 ver') st.markdown(lyrics_eng_list[3]) st.markdown('') with tab5: st.subheader('TOP 5') st.markdown('**제목** : {0}'.format(top_five_df.iloc[4]['name'])) st.markdown('**가수** : {0} '.format(top_five_df.iloc[4]['Artist'])) st.markdown('**url** : {0} '.format(top_five_df.iloc[4]['url'])) st.markdown('**유사도** : {0:.4f}'.format(top_five_df.iloc[4]['book'])) with st.expander('가사'): st.caption('원본 ver') st.markdown(lyrics_list[4]) st.caption('영어 ver') st.markdown(lyrics_eng_list[4]) st.subheader('📖 노래와 내용이 유사한 노래') st.caption('AF : 가사 : 키워드 = 0 : 0.5 : 0.5') tab1, tab2, tab3, tab4, tab5= st.tabs(['TOP 1' , 'TOP 2', 'TOP 3', 'TOP 4', 'TOP 5']) with tab1: st.subheader('🥇 TOP 1') st.markdown('**제목** : {0}'.format(top_five_df_1.iloc[0]['name'])) st.markdown('**가수** : {0} '.format(top_five_df_1.iloc[0]['Artist'])) st.markdown('**url** : {0} '.format(top_five_df_1.iloc[0]['url'])) st.markdown('**유사도** : {0:.4f}'.format(top_five_df_1.iloc[0]['book'])) with st.expander('가사'): st.caption('원본 ver') st.markdown(lyrics_list[5]) st.caption('영어 ver') st.markdown(lyrics_eng_list[5]) st.markdown('') with tab2: st.subheader('🥈 TOP 2') st.markdown('**제목** : {0}'.format(top_five_df_1.iloc[1]['name'])) st.markdown('**가수** : {0} '.format(top_five_df_1.iloc[1]['Artist'])) st.markdown('**url** : {0} '.format(top_five_df_1.iloc[1]['url'])) st.markdown('**유사도** : {0:.4f}'.format(top_five_df_1.iloc[1]['book'])) with st.expander('가사'): st.caption('원본 ver') st.markdown(lyrics_list[6]) st.caption('영어 ver') st.markdown(lyrics_eng_list[6]) st.markdown('') with tab3: st.subheader('🥉 TOP 3') st.markdown('**제목** : {0}'.format(top_five_df_1.iloc[2]['name'])) st.markdown('**가수** : {0} '.format(top_five_df_1.iloc[2]['Artist'])) st.markdown('**url** : {0} '.format(top_five_df_1.iloc[2]['url'])) st.markdown('**유사도** : {0:.4f}'.format(top_five_df_1.iloc[2]['book'])) with st.expander('가사'): st.caption('원본 ver') st.markdown(lyrics_list[7]) st.caption('영어 ver') st.markdown(lyrics_eng_list[7]) st.markdown('') with tab4: st.subheader('TOP 4') st.markdown('**제목** : {0}'.format(top_five_df_1.iloc[3]['name'])) st.markdown('**가수** : {0} '.format(top_five_df_1.iloc[3]['Artist'])) st.markdown('**url** : {0} '.format(top_five_df_1.iloc[3]['url'])) st.markdown('**유사도** : {0:.4f}'.format(top_five_df_1.iloc[3]['book'])) with st.expander('가사'): st.caption('원본 ver') st.markdown(lyrics_list[8]) st.caption('영어 ver') st.markdown(lyrics_eng_list[8]) st.markdown('') with tab5: st.subheader('TOP 5') st.markdown('**제목** : {0}'.format(top_five_df_1.iloc[4]['name'])) st.markdown('**가수** : {0} '.format(top_five_df_1.iloc[4]['Artist'])) st.markdown('**url** : {0} '.format(top_five_df_1.iloc[4]['url'])) st.markdown('**유사도** : {0:.4f}'.format(top_five_df_1.iloc[4]['book'])) with st.expander('가사'): st.caption('원본 ver') st.markdown(lyrics_list[9]) st.caption('영어 ver') st.markdown(lyrics_eng_list[9])