JERNGOC committed on
Commit
d2b12b1
·
verified ·
1 Parent(s): d9ec1cb

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +82 -0
app.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """keyword_extraction"""
3
+ import requests
4
+ import jieba
5
+ from keybert import KeyBERT
6
+ from sklearn.feature_extraction.text import CountVectorizer
7
+ import streamlit as st
8
+ import matplotlib.pyplot as plt
9
+ from matplotlib.font_manager import FontProperties
10
+
11
+ # Download font
12
@st.cache_data
def download_font(url, save_path):
    """Download the file at *url* and write its bytes to *save_path*.

    Args:
        url: HTTP(S) address of the font file.
        save_path: Local path the downloaded bytes are written to.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an error status code.
    """
    # Bound the wait so a stalled server cannot hang the app on startup.
    response = requests.get(url, timeout=30)
    # Fail loudly instead of saving an HTTP error page as if it were a font.
    response.raise_for_status()
    with open(save_path, 'wb') as f:
        f.write(response.content)
17
+
18
# Local file name for the Taipei Sans TC font and the remote URL it comes from
font_path = 'TaipeiSansTCBeta-Regular.ttf'
font_url = 'https://drive.google.com/uc?id=1eGAsTN1HBpJAkeVM57_C7ccp7hbgSz3_&export=download'

# Fetch the font file, then wrap it in the FontProperties object that the
# plotting code passes to matplotlib text elements
download_font(font_url, font_path)
font_prop = FontProperties(fname=font_path)
27
+
28
+ # Define tokenizer function
29
def jieba_tokenizer(text):
    """Split *text* into tokens with jieba, dropping whitespace-only tokens.

    ``jieba.lcut`` keeps runs of spaces and line breaks as tokens of their
    own. Left in, they become vocabulary entries of the vectorizer that uses
    this tokenizer and can surface as extracted "keywords", so they are
    filtered out here.

    Args:
        text: The string to segment.

    Returns:
        A list of non-blank token strings in their original order.
    """
    return [token for token in jieba.lcut(text) if token.strip()]
31
+
32
+ # Initialize CountVectorizer and KeyBERT model
33
@st.cache_resource
def load_models():
    """Build the vectorizer and the two KeyBERT models used by the app.

    Returns:
        A ``(vectorizer, kw_model, kw_model_multilingual)`` tuple: a
        CountVectorizer that tokenizes with ``jieba_tokenizer``, a KeyBERT
        instance created with its default model, and a KeyBERT instance
        created with 'distiluse-base-multilingual-cased-v1'.
    """
    # Tuple elements are evaluated left to right, so the objects are
    # constructed in the same order as they are returned.
    return (
        CountVectorizer(tokenizer=jieba_tokenizer),
        KeyBERT(),
        KeyBERT(model='distiluse-base-multilingual-cased-v1'),
    )


# Module-level handles shared by the extraction code below
vectorizer, kw_model, kw_model_multilingual = load_models()
41
+
42
+ # Extract keywords function
43
def extract_keywords(doc, model):
    """Return the keywords that *model* extracts from *doc*.

    The module-level ``vectorizer`` is passed to the model's
    ``extract_keywords`` call.

    Args:
        doc: The text to extract keywords from.
        model: An object exposing ``extract_keywords(doc, vectorizer=...)``.

    Returns:
        Whatever ``model.extract_keywords`` returns, unchanged.
    """
    return model.extract_keywords(doc, vectorizer=vectorizer)
46
+
47
+ # Plot keywords function
48
def plot_keywords(keywords, title):
    """Draw *keywords* as a horizontal bar chart and show it in Streamlit.

    Args:
        keywords: Sequence of items whose element 0 is the keyword text and
            element 1 is its score.
        title: Text used as the chart title.
    """
    words = [kw[0] for kw in keywords]
    scores = [kw[1] for kw in keywords]
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        ax.barh(words, scores, color='skyblue')
        ax.set_xlabel('分數', fontproperties=font_prop)
        ax.set_title(title, fontproperties=font_prop)
        ax.invert_yaxis()  # Invert Y-axis so that the highest scoring keyword is on top
        ax.tick_params(axis='both', which='major', labelsize=10)
        # Style the tick labels of this Axes directly rather than through
        # plt.xticks/plt.yticks, which act on pyplot's global "current axes".
        for label in (*ax.get_xticklabels(), *ax.get_yticklabels()):
            label.set_fontproperties(font_prop)
        st.pyplot(fig)
    finally:
        # pyplot keeps every figure it creates until it is closed; release
        # this one so repeated button presses do not accumulate figures.
        plt.close(fig)
60
+
61
+ # Streamlit app
62
st.title("中文關鍵詞提取工具")

doc = st.text_area("請輸入文章:")

if st.button("提取關鍵詞"):
    # Whitespace-only input has nothing to analyse, so treat it as empty.
    if doc.strip():
        # Run both models over the same text and present each result the
        # same way: a heading, one "word: score" line per keyword, a chart.
        for model, heading in (
            (kw_model, "關鍵詞提取結果"),
            (kw_model_multilingual, "多語言模型關鍵詞提取結果"),
        ):
            keywords = extract_keywords(doc, model)
            st.write(f"{heading}:")
            for keyword in keywords:
                st.write(f"{keyword[0]}: {keyword[1]:.4f}")

            plot_keywords(keywords, heading)
    else:
        st.write("請輸入文章內容以進行關鍵詞提取。")