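# Streamlit app: fetch a Yahoo News article, extract Chinese keywords with
# KeyBERT + jieba, re-rank them with an MMR-style score, and render a bar
# chart and a word cloud of the results.
# Run with:  streamlit run app.py
# (the filename 'app.py' is an assumption; use whatever name this file is saved under)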

import io

import requests
import numpy as np
import jieba
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from bs4 import BeautifulSoup
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from wordcloud import WordCloud
import streamlit as st

# Download the CJK-capable font used for matplotlib labels and the word cloud
def download_font(url, save_path):
    response = requests.get(url)
    response.raise_for_status()
    with open(save_path, 'wb') as f:
        f.write(response.content)

# Font URL (Taipei Sans TC Beta) and local save path
font_url = 'https://drive.google.com/uc?id=1eGAsTN1HBpJAkeVM57_C7ccp7hbgSz3_&export=download'
font_path = 'TaipeiSansTCBeta-Regular.ttf'

# Download the font and register it with matplotlib
download_font(font_url, font_path)
font_prop = FontProperties(fname=font_path)

# Tokenizer: segment Chinese text into words with jieba
def jieba_tokenizer(text):
    return jieba.lcut(text)
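# Illustrative example (actual segmentation depends on jieba's dictionary):
#   jieba.lcut('台積電市值創新高')  ->  ['台積電', '市值', '創', '新高']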

# Initialize the CountVectorizer (jieba-based) and the embedding models;
# token_pattern=None silences scikit-learn's warning when a custom tokenizer is used
vectorizer = CountVectorizer(tokenizer=jieba_tokenizer, token_pattern=None)
kw_model = KeyBERT()
sentence_model = SentenceTransformer('distiluse-base-multilingual-cased-v1')
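# Both models download their weights on first use. Note that KeyBERT() with no
# argument falls back to its default English sentence-transformer (recent
# versions use 'all-MiniLM-L6-v2'); for Chinese text the multilingual model
# instantiated further below is usually the better fit.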

# MMR-style re-ranking: blends each keyword's similarity to the whole document
# with its similarity to the joint keyword query. Unlike classic Maximal
# Marginal Relevance, this variant applies no pairwise diversity penalty
# between the keywords themselves.
def mmr(keywords, doc_embedding, query_embedding, lambda_param=0.5):
    keyword_embeddings = np.array([sentence_model.encode(kw[0]) for kw in keywords])
    doc_similarity = cosine_similarity(keyword_embeddings, doc_embedding.reshape(1, -1)).flatten()
    query_similarity = cosine_similarity(keyword_embeddings, query_embedding.reshape(1, -1)).flatten()
    mmr_scores = (1 - lambda_param) * doc_similarity + lambda_param * query_similarity
    sorted_indices = np.argsort(mmr_scores)[::-1]
    return [keywords[i] for i in sorted_indices]
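# For classic MMR with an explicit diversity penalty, KeyBERT ships the
# behaviour natively; a sketch using the same kw_model and vectorizer would be:
#   kw_model.extract_keywords(doc, vectorizer=vectorizer,
#                             use_mmr=True, diversity=0.5)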

# Extract keywords with KeyBERT, then re-rank them with the scorer above
def extract_keywords(doc, lambda_param=0.5):
    keywords = kw_model.extract_keywords(doc, vectorizer=vectorizer)
    doc_embedding = sentence_model.encode(doc)
    query_embedding = sentence_model.encode(' '.join([kw[0] for kw in keywords]))
    return mmr(keywords, doc_embedding, query_embedding, lambda_param=lambda_param)
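# Returns a list of (keyword, score) tuples, e.g. [('台積電', 0.71), ('市值', 0.64), ...]
# (scores illustrative only).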

# Plot the keywords as a horizontal bar chart and return it as a PNG buffer
def plot_keywords(keywords, title):
    words = [kw[0] for kw in keywords]
    scores = [kw[1] for kw in keywords]
    plt.figure(figsize=(10, 6))
    plt.barh(words, scores, color='skyblue')
    plt.xlabel('Score', fontproperties=font_prop)
    plt.title(title, fontproperties=font_prop)
    plt.gca().invert_yaxis()  # put the highest-scoring keyword on top
    plt.xticks(fontproperties=font_prop)
    plt.yticks(fontproperties=font_prop)
    buf = io.BytesIO()
    plt.savefig(buf, format='png')
    buf.seek(0)
    plt.close()
    return buf

# Generate a word cloud from the article text and return it as a PNG buffer
def generate_wordcloud(text):
    # Segment the text with jieba; WordCloud expects space-delimited tokens
    words = jieba.lcut(text)
    wordcloud_text = ' '.join(words)
    # The CJK font downloaded above is required to render Chinese glyphs
    wordcloud = WordCloud(font_path=font_path, width=800, height=400,
                          background_color='white').generate(wordcloud_text)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    buf = io.BytesIO()
    plt.savefig(buf, format='png')
    buf.seek(0)
    plt.close()
    return buf

# Fetch the body text of a Yahoo News article
def fetch_article(url):
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    # Yahoo News currently wraps the article body in a div with class
    # 'caas-body'; this selector will break if Yahoo changes its markup
    article_body = soup.find('div', {'class': 'caas-body'})
    if article_body:
        paragraphs = article_body.find_all('p')
        return '\n'.join(p.get_text() for p in paragraphs)
    return "Article content not found"

# Build the Streamlit UI
st.title("Chinese Keyword Extraction Tool")

# News article URL input
url = st.text_input("Enter a news article URL:", value="https://tw.news.yahoo.com/%E5%8F%B0%E7%A9%8D%E9%9B%BB%E5%96%AE%E6%97%A5%E5%B8%82%E5%80%BC%E8%92%B8%E7%99%BC2-28%E5%85%86%E5%85%83%E5%AF%AB%E6%96%B0%E7%B4%80%E9%8C%84-800%E5%85%83%E6%8B%89%E8%AD%A6%E5%A0%B1-071946952.html")

# Slider for the MMR lambda parameter
lambda_param = st.slider("Adjust keyword diversity vs. relevance (λ):", min_value=0.0, max_value=1.0, value=0.5, step=0.1)

if st.button("Fetch and extract keywords"):
    if url:
        # Fetch the article content
        doc = fetch_article(url)
        if doc and doc != "Article content not found":
            st.write("Fetched article content:")
            st.write(doc)

            # Extract and display keywords
            keywords = extract_keywords(doc, lambda_param=lambda_param)
            st.write("Keyword extraction results:")
            for keyword in keywords:
                st.write(f"{keyword[0]}: {keyword[1]:.4f}")
            plot_img = plot_keywords(keywords, "Keyword extraction results")
            st.image(plot_img, caption="Keyword extraction chart")

            # Generate and display the word cloud
            wordcloud_img = generate_wordcloud(doc)
            st.image(wordcloud_img, caption="Article word cloud")

            # Extract keywords again with a multilingual embedding model
            kw_model_multilingual = KeyBERT(model='distiluse-base-multilingual-cased-v1')
            keywords_multilingual = kw_model_multilingual.extract_keywords(doc, vectorizer=vectorizer)
            st.write("Multilingual model keyword extraction results:")
            for keyword in keywords_multilingual:
                st.write(f"{keyword[0]}: {keyword[1]:.4f}")
            plot_img_multilingual = plot_keywords(keywords_multilingual, "Multilingual model keyword extraction results")
            st.image(plot_img_multilingual, caption="Multilingual model keyword chart")
        else:
            st.write("Could not fetch the article content.")
    else:
        st.write("Please enter a valid news article URL.")