JERNGOC committed on
Commit
bba69fc
·
verified ·
1 Parent(s): 8a0e99b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -32
app.py CHANGED
@@ -1,15 +1,13 @@
1
  import requests
2
  from bs4 import BeautifulSoup
 
3
  import jieba
4
  from keybert import KeyBERT
5
  from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
6
- import matplotlib.pyplot as plt
7
  import streamlit as st
 
8
  from wordcloud import WordCloud
9
- import os
10
-
11
- # Use a default system font
12
- font_path = '/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf'
13
 
14
  # Tokenizer
15
  def jieba_tokenizer(text):
@@ -31,18 +29,20 @@ def extract_multilingual_keywords(doc, diversity):
31
  return keywords
32
 
33
  # Plot keywords
34
- def plot_keywords(keywords, title, filename):
35
- words = [kw[0] for kw in keywords]
36
- scores = [kw[1] for kw in keywords]
37
  plt.figure(figsize=(10, 6))
38
- plt.barh(words, scores, color='#1f77b4')
39
  plt.xlabel('Score')
40
  plt.title(title)
41
  plt.gca().invert_yaxis()
42
  plt.tight_layout()
43
- plt.savefig(filename)
 
 
 
44
  plt.close()
45
- return filename
46
 
47
  # Generate word cloud
48
  def generate_word_cloud(text):
@@ -50,17 +50,18 @@ def generate_word_cloud(text):
50
  tfidf_matrix = tfidf_vectorizer.fit_transform([text])
51
  tfidf_scores = dict(zip(tfidf_vectorizer.get_feature_names_out(), tfidf_matrix.toarray().flatten()))
52
 
53
- wordcloud = WordCloud(font_path=font_path, background_color='white', width=800, height=400)
54
- wordcloud.generate_from_frequencies(tfidf_scores)
55
 
56
  plt.figure(figsize=(10, 5))
57
  plt.imshow(wordcloud, interpolation='bilinear')
58
  plt.axis('off')
59
  plt.tight_layout(pad=0)
60
- plt.savefig('wordcloud.png', dpi=300, bbox_inches='tight')
61
- plt.close()
62
 
63
- return 'wordcloud.png'
 
 
 
 
64
 
65
  # Function to scrape content and extract keywords
66
  def scrape_and_extract(url, diversity):
@@ -75,11 +76,11 @@ def scrape_and_extract(url, diversity):
75
  keywords = extract_keywords(content, diversity)
76
  keywords_multilingual = extract_multilingual_keywords(content, diversity)
77
 
78
- keyword_plot_path = plot_keywords(keywords, "Keyword Extraction Results", "keywords_plot.png")
79
- keyword_plot_multilingual_path = plot_keywords(keywords_multilingual, "Multilingual Keyword Extraction Results", "keywords_multilingual_plot.png")
80
- wordcloud_path = generate_word_cloud(content)
81
 
82
- return title, content, keywords, keyword_plot_path, keywords_multilingual, keyword_plot_multilingual_path, wordcloud_path
83
 
84
  # Streamlit Interface
85
  st.set_page_config(page_title="Professional Keyword Extraction Tool", page_icon="🔍")
@@ -93,7 +94,7 @@ diversity = st.slider("Adjust Diversity (0.0: Most Relevant, 1.0: Most Diverse)"
93
  if st.button("Extract Keywords"):
94
  if url:
95
  try:
96
- title, content, keywords, keyword_plot_path, keywords_multilingual, keyword_plot_multilingual_path, wordcloud_path = scrape_and_extract(url, diversity)
97
 
98
  st.subheader("📄 Article Title")
99
  st.write(title)
@@ -102,24 +103,21 @@ if st.button("Extract Keywords"):
102
  st.write(content)
103
 
104
  st.subheader("🔑 Extracted Keywords")
105
- keywords_str = '\n'.join([f"{kw[0]}: {kw[1]:.4f}" for kw in keywords])
106
- st.text(keywords_str)
107
 
108
  st.subheader("📊 Keywords Bar Chart")
109
- st.image(keyword_plot_path)
110
 
111
  st.subheader("🔑 Multilingual Extracted Keywords")
112
- keywords_multilingual_str = '\n'.join([f"{kw[0]}: {kw[1]:.4f}" for kw in keywords_multilingual])
113
- st.text(keywords_multilingual_str)
114
 
115
  st.subheader("📊 Multilingual Keywords Bar Chart")
116
- st.image(keyword_plot_multilingual_path)
117
 
118
- if os.path.exists(wordcloud_path):
119
- st.subheader("☁️ Word Cloud")
120
- st.image(wordcloud_path)
121
- else:
122
- st.warning("Unable to generate word cloud.")
123
  except Exception as e:
124
  st.error(f"An error occurred: {str(e)}")
125
  else:
 
1
  import requests
2
  from bs4 import BeautifulSoup
3
+ import pandas as pd
4
  import jieba
5
  from keybert import KeyBERT
6
  from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
 
7
  import streamlit as st
8
+ import matplotlib.pyplot as plt
9
  from wordcloud import WordCloud
10
+ import io
 
 
 
11
 
12
  # Tokenizer
13
  def jieba_tokenizer(text):
 
29
  return keywords
30
 
31
  # Plot keywords
32
+ def plot_keywords(keywords, title):
33
+ df = pd.DataFrame(keywords, columns=['Word', 'Score'])
 
34
  plt.figure(figsize=(10, 6))
35
+ plt.barh(df['Word'], df['Score'], color='#1f77b4')
36
  plt.xlabel('Score')
37
  plt.title(title)
38
  plt.gca().invert_yaxis()
39
  plt.tight_layout()
40
+
41
+ img = io.BytesIO()
42
+ plt.savefig(img, format='png')
43
+ img.seek(0)
44
  plt.close()
45
+ return img
46
 
47
  # Generate word cloud
48
  def generate_word_cloud(text):
 
50
  tfidf_matrix = tfidf_vectorizer.fit_transform([text])
51
  tfidf_scores = dict(zip(tfidf_vectorizer.get_feature_names_out(), tfidf_matrix.toarray().flatten()))
52
 
53
+ wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(tfidf_scores)
 
54
 
55
  plt.figure(figsize=(10, 5))
56
  plt.imshow(wordcloud, interpolation='bilinear')
57
  plt.axis('off')
58
  plt.tight_layout(pad=0)
 
 
59
 
60
+ img = io.BytesIO()
61
+ plt.savefig(img, format='png')
62
+ img.seek(0)
63
+ plt.close()
64
+ return img
65
 
66
  # Function to scrape content and extract keywords
67
  def scrape_and_extract(url, diversity):
 
76
  keywords = extract_keywords(content, diversity)
77
  keywords_multilingual = extract_multilingual_keywords(content, diversity)
78
 
79
+ keyword_plot = plot_keywords(keywords, "Keyword Extraction Results")
80
+ keyword_plot_multilingual = plot_keywords(keywords_multilingual, "Multilingual Keyword Extraction Results")
81
+ wordcloud = generate_word_cloud(content)
82
 
83
+ return title, content, keywords, keyword_plot, keywords_multilingual, keyword_plot_multilingual, wordcloud
84
 
85
  # Streamlit Interface
86
  st.set_page_config(page_title="Professional Keyword Extraction Tool", page_icon="🔍")
 
94
  if st.button("Extract Keywords"):
95
  if url:
96
  try:
97
+ title, content, keywords, keyword_plot, keywords_multilingual, keyword_plot_multilingual, wordcloud = scrape_and_extract(url, diversity)
98
 
99
  st.subheader("📄 Article Title")
100
  st.write(title)
 
103
  st.write(content)
104
 
105
  st.subheader("🔑 Extracted Keywords")
106
+ df_keywords = pd.DataFrame(keywords, columns=['Word', 'Score'])
107
+ st.dataframe(df_keywords)
108
 
109
  st.subheader("📊 Keywords Bar Chart")
110
+ st.image(keyword_plot)
111
 
112
  st.subheader("🔑 Multilingual Extracted Keywords")
113
+ df_keywords_multilingual = pd.DataFrame(keywords_multilingual, columns=['Word', 'Score'])
114
+ st.dataframe(df_keywords_multilingual)
115
 
116
  st.subheader("📊 Multilingual Keywords Bar Chart")
117
+ st.image(keyword_plot_multilingual)
118
 
119
+ st.subheader("☁️ Word Cloud")
120
+ st.image(wordcloud)
 
 
 
121
  except Exception as e:
122
  st.error(f"An error occurred: {str(e)}")
123
  else: