Spaces:
Sleeping
Sleeping
File size: 3,725 Bytes
d2b12b1 2562fe1 d2b12b1 2562fe1 d2b12b1 2562fe1 d2b12b1 2562fe1 d2b12b1 2562fe1 d2b12b1 2562fe1 d2b12b1 2562fe1 d2b12b1 2562fe1 d2b12b1 2562fe1 d2b12b1 2562fe1 d2b12b1 2562fe1 d2b12b1 2562fe1 d2b12b1 43991fd 2562fe1 43991fd d2b12b1 43991fd d2b12b1 43991fd d2b12b1 2562fe1 43991fd d2b12b1 43991fd 2562fe1 d2b12b1 2562fe1 43991fd d2b12b1 43991fd 2562fe1 d2b12b1 2562fe1 43991fd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 |
# -*- coding: utf-8 -*-
"""keyword_extraction"""
import html
import os

import jieba
import matplotlib.pyplot as plt
import requests
import streamlit as st
from keybert import KeyBERT
from matplotlib.font_manager import FontProperties
from sklearn.feature_extraction.text import CountVectorizer
# Download the font file
def download_font(url, save_path, timeout=30):
    """Download the file at ``url`` and write it to ``save_path``.

    Args:
        url: Address fetched with an HTTP GET request.
        save_path: Local path the response body is written to, in binary mode.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of saving an HTTP error page under the target name.
    response.raise_for_status()
    with open(save_path, 'wb') as f:
        f.write(response.content)
# Font URL and local save path
font_url = 'https://drive.google.com/uc?id=1eGAsTN1HBpJAkeVM57_C7ccp7hbgSz3_&export=download'
font_path = 'TaipeiSansTCBeta-Regular.ttf'
# Download the font only when it is not already on disk (or is empty), so the
# file is not fetched again each time this script is executed.
if not os.path.isfile(font_path) or os.path.getsize(font_path) == 0:
    download_font(font_url, font_path)
# Font properties passed to matplotlib for the chart labels and titles
font_prop = FontProperties(fname=font_path)
# Word-segmentation function
def jieba_tokenizer(text):
    """Split ``text`` into a list of tokens using ``jieba.lcut``."""
    tokens = jieba.lcut(text)
    return tokens
# Initialise the CountVectorizer and define the KeyBERT model
def _load_default_model():
    """Create a KeyBERT instance with its default settings."""
    return KeyBERT()


# Where Streamlit provides st.cache_resource, use it so the model is built
# once and reused instead of being rebuilt each time this script is executed.
# Older Streamlit versions fall back to the plain, uncached loader.
if hasattr(st, "cache_resource"):
    _load_default_model = st.cache_resource(_load_default_model)

# CountVectorizer that tokenizes with jieba instead of its default tokenizer
vectorizer = CountVectorizer(tokenizer=jieba_tokenizer)
kw_model = _load_default_model()
# Keyword-extraction function
def extract_keywords(doc):
    """Run the module-level KeyBERT model on ``doc`` and return its keywords.

    The module-level ``vectorizer`` is passed along so candidate terms are
    produced by the jieba tokenizer. Callers in this file read index 0 of each
    returned item as the keyword and index 1 as its score.
    """
    return kw_model.extract_keywords(doc, vectorizer=vectorizer)
# Plotting function
def plot_keywords(keywords, title):
    """Render a horizontal bar chart of keyword scores in the Streamlit page.

    Args:
        keywords: Sequence of items whose index 0 is the keyword and index 1
            is its score.
        title: Text shown as the chart title.
    """
    words = [kw[0] for kw in keywords]
    scores = [kw[1] for kw in keywords]
    # Use an explicit Figure/Axes rather than the global pyplot state so the
    # exact figure can be handed to Streamlit and released afterwards.
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh(words, scores, color='skyblue')
    ax.set_xlabel('分數', fontproperties=font_prop)
    ax.set_title(title, fontproperties=font_prop)
    ax.invert_yaxis()  # Invert the Y axis so the highest-scoring keyword is on top
    # Apply the downloaded font to the tick labels on both axes.
    for label in [*ax.get_xticklabels(), *ax.get_yticklabels()]:
        label.set_fontproperties(font_prop)
    st.pyplot(fig)
    # Close the figure so figures do not accumulate across calls.
    plt.close(fig)
# Custom CSS: inject a <style> block into the page. unsafe_allow_html=True is
# required so Streamlit emits the raw HTML instead of escaping it.
st.markdown(
"""
<style>
.main {
background-color: #f0f2f6;
padding: 2rem;
border-radius: 10px;
}
.title {
font-size: 2.5rem;
color: #4b8bbe;
text-align: center;
margin-bottom: 1.5rem;
}
.textarea {
font-size: 1.2rem;
}
.button {
background-color: #4b8bbe;
color: white;
font-size: 1.2rem;
padding: 0.5rem 1rem;
border-radius: 5px;
margin-top: 1rem;
margin-bottom: 2rem;
}
.keywords {
font-size: 1.5rem;
color: #333;
margin-top: 2rem;
}
.keyword-item {
font-size: 1.2rem;
margin: 0.5rem 0;
}
</style>
""",
unsafe_allow_html=True
)
# Build the Streamlit web application
def _load_multilingual_model():
    """Create the KeyBERT model backed by the multilingual sentence encoder."""
    return KeyBERT(model='distiluse-base-multilingual-cased-v1')


# Where Streamlit provides st.cache_resource, use it so the multilingual model
# is built once and reused instead of being re-created on every button click.
# Older Streamlit versions fall back to the plain, uncached loader.
if hasattr(st, "cache_resource"):
    _load_multilingual_model = st.cache_resource(_load_multilingual_model)


def _render_keywords(keywords, heading, chart_title):
    """Show a heading, one line per keyword with its score, then a bar chart."""
    st.markdown(f'<div class="keywords">{heading}</div>', unsafe_allow_html=True)
    for word, score in keywords:
        # Keywords are tokens of the user-supplied text, so escape them before
        # embedding them in raw HTML.
        st.markdown(
            f'<div class="keyword-item">{html.escape(word)}: {score:.4f}</div>',
            unsafe_allow_html=True,
        )
    plot_keywords(keywords, chart_title)


# NOTE(review): this opening <div> and the closing </div> at the end are sent
# in separate st.markdown calls; they may not actually wrap the widgets in
# between -- confirm in the rendered page.
st.markdown('<div class="main">', unsafe_allow_html=True)
st.markdown('<div class="title">中文關鍵詞提取工具</div>', unsafe_allow_html=True)
doc = st.text_area("請輸入文章:", height=200, key="input_text")
if st.button("提取關鍵詞", key="extract_button"):
    # Treat whitespace-only input the same as empty input.
    if doc and doc.strip():
        keywords = extract_keywords(doc)
        _render_keywords(keywords, "關鍵詞提取結果:", "關鍵詞提取結果")
        # Extract keywords again with a second, multilingual model
        kw_model_multilingual = _load_multilingual_model()
        keywords_multilingual = kw_model_multilingual.extract_keywords(doc, vectorizer=vectorizer)
        _render_keywords(
            keywords_multilingual,
            "多語言模型關鍵詞提取結果:",
            "多語言模型關鍵詞提取結果",
        )
    else:
        st.write("請輸入文章內容以進行關鍵詞提取。")
st.markdown('</div>', unsafe_allow_html=True)
|