import io
import os

import jieba
import matplotlib.pyplot as plt
import numpy as np
import requests
import streamlit as st
from bs4 import BeautifulSoup
from keybert import KeyBERT
from matplotlib.font_manager import FontProperties
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from wordcloud import WordCloud
# Download a font file so matplotlib and WordCloud can render Chinese glyphs.
def download_font(url, save_path):
    # Skip the download if the font is already cached from an earlier run.
    if os.path.exists(save_path):
        return
    response = requests.get(url)
    response.raise_for_status()
    with open(save_path, 'wb') as f:
        f.write(response.content)
# Font URL and local save path (Taipei Sans TC Beta, a traditional-Chinese font).
font_url = 'https://drive.google.com/uc?id=1eGAsTN1HBpJAkeVM57_C7ccp7hbgSz3_&export=download'
font_path = 'TaipeiSansTCBeta-Regular.ttf'

# Download the font.
download_font(font_url, font_path)

# Register the font for matplotlib labels and titles.
font_prop = FontProperties(fname=font_path)
# Tokenizer: segment Chinese text into words with jieba.
def jieba_tokenizer(text):
    return jieba.lcut(text)
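# For example (the exact segmentation depends on jieba's dictionary),
# jieba.lcut('今天天氣很好') may return ['今天', '天氣', '很', '好'],
# turning unspaced Chinese text into word tokens that CountVectorizer can count.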
# Initialize the CountVectorizer (with the jieba tokenizer) and the models.
# token_pattern=None silences scikit-learn's warning that the default token
# pattern is unused when a custom tokenizer is supplied.
vectorizer = CountVectorizer(tokenizer=jieba_tokenizer, token_pattern=None)
kw_model = KeyBERT()
sentence_model = SentenceTransformer('distiluse-base-multilingual-cased-v1')
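# Note: KeyBERT() with no arguments falls back to an English-oriented sentence
# model ('all-MiniLM-L6-v2' in recent KeyBERT releases), whereas
# 'distiluse-base-multilingual-cased-v1' also covers Chinese; the app compares
# keyword rankings from both models further below.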
# Maximal Marginal Relevance (MMR): greedily pick keywords that are relevant
# to the document while penalizing redundancy with keywords already picked
# (lambda_param = 1.0 ranks purely by relevance, 0.0 purely by diversity).
def mmr(keywords, doc_embedding, lambda_param=0.5):
    if not keywords:
        return []
    keyword_embeddings = sentence_model.encode([kw[0] for kw in keywords])
    doc_similarity = cosine_similarity(keyword_embeddings, doc_embedding.reshape(1, -1)).flatten()
    keyword_similarity = cosine_similarity(keyword_embeddings)
    # Start from the keyword most similar to the document.
    selected = [int(np.argmax(doc_similarity))]
    candidates = [i for i in range(len(keywords)) if i != selected[0]]
    while candidates:
        # Relevance to the document minus redundancy with already-picked keywords.
        redundancy = keyword_similarity[np.ix_(candidates, selected)].max(axis=1)
        mmr_scores = lambda_param * doc_similarity[candidates] - (1 - lambda_param) * redundancy
        best = candidates[int(np.argmax(mmr_scores))]
        selected.append(best)
        candidates.remove(best)
    return [keywords[i] for i in selected]
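# A small worked example of the trade-off, assuming candidates A and B with
# document similarities 0.80 and 0.75: if B is nearly a duplicate of an
# already-selected keyword (redundancy 0.9) while A is not (redundancy 0.1),
# then at lambda_param = 0.5, A scores 0.5*0.80 - 0.5*0.1 = 0.35 while
# B scores 0.5*0.75 - 0.5*0.9 = -0.075, so the more diverse A is picked first.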
# Extract candidate keywords with KeyBERT, then re-rank them with MMR.
def extract_keywords(doc, lambda_param=0.5):
    keywords = kw_model.extract_keywords(doc, vectorizer=vectorizer)
    doc_embedding = sentence_model.encode(doc)
    return mmr(keywords, doc_embedding, lambda_param=lambda_param)
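# KeyBERT returns (keyword, score) tuples, so extract_keywords yields
# something like [('台積電', 0.71), ('市值', 0.65), ...] (hypothetical values);
# the MMR pass above changes the ordering but keeps KeyBERT's original
# document-similarity scores attached.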
# Plot the keywords as a horizontal bar chart and return it as a PNG buffer.
def plot_keywords(keywords, title):
    words = [kw[0] for kw in keywords]
    scores = [kw[1] for kw in keywords]
    plt.figure(figsize=(10, 6))
    plt.barh(words, scores, color='skyblue')
    plt.xlabel('Score', fontproperties=font_prop)
    plt.title(title, fontproperties=font_prop)
    plt.gca().invert_yaxis()  # Invert the Y axis so the top-ranked keyword is drawn at the top
    plt.xticks(fontproperties=font_prop)
    plt.yticks(fontproperties=font_prop)
    buf = io.BytesIO()
    plt.savefig(buf, format='png')
    buf.seek(0)
    plt.close()
    return buf
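# The returned PNG buffer can be handed straight to st.image below, which
# accepts file-like objects, so no temporary file is needed.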
# Generate a word cloud from the article text and return it as a PNG buffer.
def generate_wordcloud(text):
    # Segment the text into words with jieba.
    words = jieba.lcut(text)
    # WordCloud splits its input on whitespace, so join the tokens with spaces.
    wordcloud_text = ' '.join(words)
    # Generate the word cloud.
    wordcloud = WordCloud(font_path=font_path, width=800, height=400,
                          background_color='white').generate(wordcloud_text)
    # Render the word cloud.
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    buf = io.BytesIO()
    plt.savefig(buf, format='png')
    buf.seek(0)
    plt.close()
    return buf
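# font_path matters here: WordCloud's bundled default font (DroidSansMono)
# has no CJK coverage, so without it Chinese words would render as empty boxes.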
# Fetch the body text of a Yahoo News article; return None if no body is found.
def fetch_article(url):
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    # Yahoo News wraps the article body in a div with class 'caas-body'.
    article_body = soup.find('div', {'class': 'caas-body'})
    if article_body:
        paragraphs = article_body.find_all('p')
        return '\n'.join(p.get_text() for p in paragraphs)
    return None
# Build the Streamlit web app.
st.title("Chinese Keyword Extraction Tool")

# Input: news article URL.
url = st.text_input("Enter a news article URL:", value="https://tw.news.yahoo.com/%E5%8F%B0%E7%A9%8D%E9%9B%BB%E5%96%AE%E6%97%A5%E5%B8%82%E5%80%BC%E8%92%B8%E7%99%BC2-28%E5%85%86%E5%85%83%E5%AF%AB%E6%96%B0%E7%B4%80%E9%8C%84-800%E5%85%83%E6%8B%89%E8%AD%A6%E5%A0%B1-071946952.html")

# MMR lambda parameter: 1.0 ranks purely by relevance, 0.0 purely by diversity.
lambda_param = st.slider("Keyword relevance vs. diversity (λ):", min_value=0.0, max_value=1.0, value=0.5, step=0.1)

if st.button("Fetch article and extract keywords"):
    if url:
        # Fetch the article content.
        doc = fetch_article(url)
        if doc:
            st.write("Fetched article content:")
            st.write(doc)
            # Extract and display the MMR-ranked keywords.
            keywords = extract_keywords(doc, lambda_param=lambda_param)
            st.write("Keyword extraction results:")
            for keyword in keywords:
                st.write(f"{keyword[0]}: {keyword[1]:.4f}")
            plot_img = plot_keywords(keywords, "Keyword Extraction Results")
            st.image(plot_img, caption="Keyword extraction results")
            # Generate and display the word cloud.
            wordcloud_img = generate_wordcloud(doc)
            st.image(wordcloud_img, caption="Word cloud of the article content")
            # Extract keywords again with the multilingual sentence model,
            # reusing the SentenceTransformer instance loaded above.
            kw_model_multilingual = KeyBERT(model=sentence_model)
            keywords_multilingual = kw_model_multilingual.extract_keywords(doc, vectorizer=vectorizer)
            st.write("Multilingual model keyword extraction results:")
            for keyword in keywords_multilingual:
                st.write(f"{keyword[0]}: {keyword[1]:.4f}")
            plot_img_multilingual = plot_keywords(keywords_multilingual, "Multilingual Model Keyword Extraction Results")
            st.image(plot_img_multilingual, caption="Multilingual model keyword extraction results")
        else:
            st.write("Could not fetch the article content.")
    else:
        st.write("Please enter a valid news article URL.")