sidcww committed on
Commit
a2ecf7a
·
verified ·
1 Parent(s): 908f94b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +49 -36
app.py CHANGED
@@ -1,12 +1,15 @@
 
 
 
1
  import requests
 
 
2
  import jieba
3
  from keybert import KeyBERT
4
  from sklearn.feature_extraction.text import CountVectorizer
5
  import streamlit as st
6
  import matplotlib.pyplot as plt
7
  from matplotlib.font_manager import FontProperties
8
- from bs4 import BeautifulSoup
9
- import pandas as pd
10
 
11
  # 下載字體
12
  def download_font(url, save_path):
@@ -24,7 +27,16 @@ download_font(font_url, font_path)
24
  # 設置字體
25
  font_prop = FontProperties(fname=font_path)
26
 
27
- # 定義斷詞函數
 
 
 
 
 
 
 
 
 
28
  def jieba_tokenizer(text):
29
  return jieba.lcut(text)
30
 
@@ -41,55 +53,56 @@ def extract_keywords(doc):
41
  def plot_keywords(keywords, title):
42
  words = [kw[0] for kw in keywords]
43
  scores = [kw[1] for kw in keywords]
 
44
  plt.figure(figsize=(10, 6))
45
- plt.barh(words, scores, color='skyblue')
46
- plt.xlabel('分數', fontproperties=font_prop)
47
- plt.title(title, fontproperties=font_prop)
48
- plt.gca().invert_yaxis()
49
- plt.xticks(fontproperties=font_prop)
50
- plt.yticks(fontproperties=font_prop)
51
- st.pyplot(plt)
 
 
 
 
 
52
 
53
- # 從Yahoo News抓取新聞的函數
54
def fetch_yahoo_news(url):
    """Download a Yahoo News article and return its title and body text.

    Args:
        url: Address of the article page to fetch.

    Returns:
        A ``(title, content)`` tuple: the text of the page's first ``<h1>``
        and the text of its ``<div class="caas-body">`` element.

    Raises:
        requests.RequestException: On a network failure, a timeout, or an
            HTTP error status.
        ValueError: If the page has no ``<h1>`` or no ``caas-body`` div.
    """
    # Without an explicit timeout requests may wait indefinitely on a host
    # that never answers, which would freeze the app.
    response = requests.get(url, timeout=10)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    title_tag = soup.find('h1')
    body_tag = soup.find('div', {'class': 'caas-body'})
    # find() returns None when nothing matches; report that clearly instead
    # of failing later with an AttributeError on ``.text``.
    if title_tag is None or body_tag is None:
        raise ValueError(f"Could not locate the article title or body at {url}")
    return title_tag.text, body_tag.text
61
 
62
  # 建立Streamlit網頁應用程式
63
st.title("中文關鍵詞提取工具")

# Let the user choose between typing text and fetching a Yahoo News article.
input_method = st.radio("選擇輸入方式", ("手動輸入", "從Yahoo News抓取"))

# Start with an empty document so the button handler below never sees an
# unbound name when the Yahoo option is selected but no URL was entered yet.
doc = ""
if input_method == "手動輸入":
    doc = st.text_area("請輸入文章:")
else:
    url = st.text_input("請輸入Yahoo News文章URL:")
    if url:
        title, doc = fetch_yahoo_news(url)
        st.write(f"標題: {title}")
        st.write("文章內容:")
        st.write(doc)

if st.button("提取關鍵詞"):
    if doc:
        # Keywords from the file's own extract_keywords helper.
        keywords = extract_keywords(doc)
        st.write("關鍵詞提取結果:")
        for keyword in keywords:
            st.write(f"{keyword[0]}: {keyword[1]:.4f}")

        plot_keywords(keywords, "關鍵詞提取結果")

        # Second pass with the multilingual sentence-embedding model.
        kw_model_multilingual = KeyBERT(model='distiluse-base-multilingual-cased-v1')
        keywords_multilingual = kw_model_multilingual.extract_keywords(doc, vectorizer=vectorizer)
        st.write("多語言模型關鍵詞提取結果:")
        for keyword in keywords_multilingual:
            st.write(f"{keyword[0]}: {keyword[1]:.4f}")

        plot_keywords(keywords_multilingual, "多語言模型關鍵詞提取結果")
    else:
        st.write("請輸入文章內容或提供有效的Yahoo News URL以進行關鍵詞提取。")
 
1
+ # -*- coding: utf-8 -*-
2
+ """keyword_extraction"""
3
+
4
  import requests
5
+ from bs4 import BeautifulSoup
6
+ import pandas as pd
7
  import jieba
8
  from keybert import KeyBERT
9
  from sklearn.feature_extraction.text import CountVectorizer
10
  import streamlit as st
11
  import matplotlib.pyplot as plt
12
  from matplotlib.font_manager import FontProperties
 
 
13
 
14
  # 下載字體
15
  def download_font(url, save_path):
 
27
  # 設置字體
28
  font_prop = FontProperties(fname=font_path)
29
 
30
+ # 抓取Yahoo新聞標題和內容
31
def fetch_yahoo_news(url):
    """Download a Yahoo News article and return its title and body text.

    Args:
        url: Address of the article page to fetch.

    Returns:
        A ``(title, content)`` tuple: the text of the page's first ``<h1>``
        and the text of its first ``<article>`` element.

    Raises:
        requests.RequestException: On a network failure, a timeout, or an
            HTTP error status.
        ValueError: If the page has no ``<h1>`` or no ``<article>`` element.
    """
    # Without an explicit timeout requests may wait indefinitely on a host
    # that never answers, which would freeze the app.
    response = requests.get(url, timeout=10)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    title_tag = soup.find('h1')
    article_tag = soup.find('article')
    # find() returns None when nothing matches; report that clearly instead
    # of failing later with an AttributeError on ``.text``.
    if title_tag is None or article_tag is None:
        raise ValueError(f"Could not locate the article title or body at {url}")
    return title_tag.text, article_tag.text
38
+
39
+ # 斷詞函數
40
  def jieba_tokenizer(text):
41
  return jieba.lcut(text)
42
 
 
53
  def plot_keywords(keywords, title):
54
  words = [kw[0] for kw in keywords]
55
  scores = [kw[1] for kw in keywords]
56
+
57
  plt.figure(figsize=(10, 6))
58
+ bars = plt.barh(words, scores, color='skyblue', edgecolor='black', linewidth=1.2)
59
+ plt.xlabel('分數', fontproperties=font_prop, fontsize=14)
60
+ plt.title(title, fontproperties=font_prop, fontsize=16)
61
+ plt.gca().invert_yaxis() # 反轉Y軸,使得分數最高的關鍵詞在最上面
62
+ plt.xticks(fontproperties=font_prop, fontsize=12)
63
+ plt.yticks(fontproperties=font_prop, fontsize=12)
64
+ plt.grid(axis='x', linestyle='--', alpha=0.7)
65
+
66
+ # 添加分數標籤
67
+ for bar in bars:
68
+ plt.gca().text(bar.get_width() + 0.01, bar.get_y() + bar.get_height() / 2,
69
+ f'{bar.get_width():.4f}', va='center', ha='left', fontsize=12, fontproperties=font_prop)
70
 
71
+ st.pyplot(plt)
 
 
 
 
 
 
 
72
 
73
  # 建立Streamlit網頁應用程式
74
st.title("🤙🤙🤙YAHOO新聞關鍵詞提取工具👂👂")

# URL of the Yahoo News article to analyse.
url = st.text_input("輸入Yahoo新聞的URL:")

if st.button("抓取並提取關鍵詞"):
    # Treat a whitespace-only entry the same as an empty one.
    url = url.strip()
    if not url:
        st.write("請輸入有效的Yahoo新聞URL。")
    else:
        try:
            title, content = fetch_yahoo_news(url)
        except (requests.RequestException, AttributeError, ValueError) as exc:
            # Bad URL, network/HTTP failure, or a page without the expected
            # elements: show a message instead of a raw traceback.
            st.error(f"無法抓取新聞:{exc}")
        else:
            st.write("新聞標題:", title)
            st.write("新聞內容", content)

            # Show the article as a one-row DataFrame.
            data = {'Title': [title], 'Content': [content]}
            df = pd.DataFrame(data)
            st.write("新聞內容的DataFrame:")
            st.write(df)

            # Keywords from the file's own extract_keywords helper.
            keywords = extract_keywords(content)
            st.write("關鍵詞提取結果:")
            for keyword in keywords:
                st.write(f"{keyword[0]}: {keyword[1]:.4f}")

            plot_keywords(keywords, "關鍵詞提取結果")

            # Second pass with the multilingual sentence-embedding model.
            kw_model_multilingual = KeyBERT(model='distiluse-base-multilingual-cased-v1')
            keywords_multilingual = kw_model_multilingual.extract_keywords(content, vectorizer=vectorizer)
            st.write("多語言模型關鍵詞提取結果:")
            for keyword in keywords_multilingual:
                st.write(f"{keyword[0]}: {keyword[1]:.4f}")

            plot_keywords(keywords_multilingual, "多語言模型關鍵詞提取結果")