JERNGOC committed on
Commit
c08903c
·
verified ·
1 Parent(s): a938177

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +53 -49
app.py CHANGED
@@ -4,21 +4,12 @@ import jieba
4
  from keybert import KeyBERT
5
  from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
6
  import matplotlib.pyplot as plt
7
- from matplotlib.font_manager import FontProperties
8
  import streamlit as st
9
  from wordcloud import WordCloud
 
10
 
11
- # Download font
12
- def download_font(url, save_path):
13
- response = requests.get(url)
14
- with open(save_path, 'wb') as f:
15
- f.write(response.content)
16
-
17
- # Font URL and path
18
- font_url = 'https://drive.google.com/uc?id=1eGAsTN1HBpJAkeVM57_C7ccp7hbgSz3_&export=download'
19
- font_path = 'TaipeiSansTCBeta-Regular.ttf'
20
- download_font(font_url, font_path)
21
- font_prop = FontProperties(fname=font_path)
22
 
23
  # Tokenizer
24
  def jieba_tokenizer(text):
@@ -45,13 +36,13 @@ def plot_keywords(keywords, title, filename):
45
  scores = [kw[1] for kw in keywords]
46
  plt.figure(figsize=(10, 6))
47
  plt.barh(words, scores, color='#1f77b4')
48
- plt.xlabel('Score', fontproperties=font_prop)
49
- plt.title(title, fontproperties=font_prop)
50
  plt.gca().invert_yaxis()
51
- plt.xticks(fontproperties=font_prop)
52
- plt.yticks(fontproperties=font_prop)
53
- plt.savefig(f'/tmp/{filename}.png')
54
- return f'/tmp/{filename}.png'
55
 
56
  # Generate word cloud
57
  def generate_word_cloud(text):
@@ -59,9 +50,17 @@ def generate_word_cloud(text):
59
  tfidf_matrix = tfidf_vectorizer.fit_transform([text])
60
  tfidf_scores = dict(zip(tfidf_vectorizer.get_feature_names_out(), tfidf_matrix.toarray().flatten()))
61
 
62
- wordcloud = WordCloud(font_path=font_path, background_color='white').generate_from_frequencies(tfidf_scores)
63
- wordcloud.to_file('/tmp/wordcloud.png')
64
- return '/tmp/wordcloud.png'
 
 
 
 
 
 
 
 
65
 
66
  # Function to scrape content and extract keywords
67
  def scrape_and_extract(url, diversity):
@@ -76,8 +75,8 @@ def scrape_and_extract(url, diversity):
76
  keywords = extract_keywords(content, diversity)
77
  keywords_multilingual = extract_multilingual_keywords(content, diversity)
78
 
79
- keyword_plot_path = plot_keywords(keywords, "Keyword Extraction Results", "keywords_plot")
80
- keyword_plot_multilingual_path = plot_keywords(keywords_multilingual, "Multilingual Keyword Extraction Results", "keywords_multilingual_plot")
81
  wordcloud_path = generate_word_cloud(content)
82
 
83
  return title, content, keywords, keyword_plot_path, keywords_multilingual, keyword_plot_multilingual_path, wordcloud_path
@@ -93,30 +92,35 @@ diversity = st.slider("Adjust Diversity (0.0: Most Relevant, 1.0: Most Diverse)"
93
 
94
  if st.button("Extract Keywords"):
95
  if url:
96
- title, content, keywords, keyword_plot_path, keywords_multilingual, keyword_plot_multilingual_path, wordcloud_path = scrape_and_extract(url, diversity)
97
-
98
- st.subheader("πŸ“„ Article Title")
99
- st.write(title)
100
-
101
- st.subheader("πŸ“ Article Content")
102
- st.write(content)
103
-
104
- st.subheader("πŸ”‘ Extracted Keywords")
105
- keywords_str = '\n'.join([f"{kw[0]}: {kw[1]:.4f}" for kw in keywords])
106
- st.text(keywords_str)
107
-
108
- st.subheader("πŸ“Š Keywords Bar Chart")
109
- st.image(keyword_plot_path)
110
-
111
- st.subheader("πŸ”‘ Multilingual Extracted Keywords")
112
- keywords_multilingual_str = '\n'.join([f"{kw[0]}: {kw[1]:.4f}" for kw in keywords_multilingual])
113
- st.text(keywords_multilingual_str)
114
-
115
- st.subheader("πŸ“Š Multilingual Keywords Bar Chart")
116
- st.image(keyword_plot_multilingual_path)
117
-
118
- st.subheader("☁️ Word Cloud")
119
- st.image(wordcloud_path)
 
 
 
 
 
 
120
  else:
121
- st.warning("Please enter a URL to extract keywords.")
122
-
 
4
  from keybert import KeyBERT
5
  from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
6
  import matplotlib.pyplot as plt
 
7
  import streamlit as st
8
  from wordcloud import WordCloud
9
+ import os
10
 
11
+ # Use a default system font
12
+ font_path = '/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf'
 
 
 
 
 
 
 
 
 
13
 
14
  # Tokenizer
15
  def jieba_tokenizer(text):
 
36
  scores = [kw[1] for kw in keywords]
37
  plt.figure(figsize=(10, 6))
38
  plt.barh(words, scores, color='#1f77b4')
39
+ plt.xlabel('Score')
40
+ plt.title(title)
41
  plt.gca().invert_yaxis()
42
+ plt.tight_layout()
43
+ plt.savefig(filename)
44
+ plt.close()
45
+ return filename
46
 
47
  # Generate word cloud
48
  def generate_word_cloud(text):
 
50
  tfidf_matrix = tfidf_vectorizer.fit_transform([text])
51
  tfidf_scores = dict(zip(tfidf_vectorizer.get_feature_names_out(), tfidf_matrix.toarray().flatten()))
52
 
53
+ wordcloud = WordCloud(font_path=font_path, background_color='white', width=800, height=400)
54
+ wordcloud.generate_from_frequencies(tfidf_scores)
55
+
56
+ plt.figure(figsize=(10, 5))
57
+ plt.imshow(wordcloud, interpolation='bilinear')
58
+ plt.axis('off')
59
+ plt.tight_layout(pad=0)
60
+ plt.savefig('wordcloud.png', dpi=300, bbox_inches='tight')
61
+ plt.close()
62
+
63
+ return 'wordcloud.png'
64
 
65
  # Function to scrape content and extract keywords
66
  def scrape_and_extract(url, diversity):
 
75
  keywords = extract_keywords(content, diversity)
76
  keywords_multilingual = extract_multilingual_keywords(content, diversity)
77
 
78
+ keyword_plot_path = plot_keywords(keywords, "Keyword Extraction Results", "keywords_plot.png")
79
+ keyword_plot_multilingual_path = plot_keywords(keywords_multilingual, "Multilingual Keyword Extraction Results", "keywords_multilingual_plot.png")
80
  wordcloud_path = generate_word_cloud(content)
81
 
82
  return title, content, keywords, keyword_plot_path, keywords_multilingual, keyword_plot_multilingual_path, wordcloud_path
 
92
 
93
  if st.button("Extract Keywords"):
94
  if url:
95
+ try:
96
+ title, content, keywords, keyword_plot_path, keywords_multilingual, keyword_plot_multilingual_path, wordcloud_path = scrape_and_extract(url, diversity)
97
+
98
+ st.subheader("πŸ“„ Article Title")
99
+ st.write(title)
100
+
101
+ st.subheader("πŸ“ Article Content")
102
+ st.write(content)
103
+
104
+ st.subheader("πŸ”‘ Extracted Keywords")
105
+ keywords_str = '\n'.join([f"{kw[0]}: {kw[1]:.4f}" for kw in keywords])
106
+ st.text(keywords_str)
107
+
108
+ st.subheader("πŸ“Š Keywords Bar Chart")
109
+ st.image(keyword_plot_path)
110
+
111
+ st.subheader("πŸ”‘ Multilingual Extracted Keywords")
112
+ keywords_multilingual_str = '\n'.join([f"{kw[0]}: {kw[1]:.4f}" for kw in keywords_multilingual])
113
+ st.text(keywords_multilingual_str)
114
+
115
+ st.subheader("πŸ“Š Multilingual Keywords Bar Chart")
116
+ st.image(keyword_plot_multilingual_path)
117
+
118
+ if os.path.exists(wordcloud_path):
119
+ st.subheader("☁️ Word Cloud")
120
+ st.image(wordcloud_path)
121
+ else:
122
+ st.warning("Unable to generate word cloud.")
123
+ except Exception as e:
124
+ st.error(f"An error occurred: {str(e)}")
125
  else:
126
+ st.warning("Please enter a URL to extract keywords.")