JERNGOC committed on
Commit
6558ced
·
verified ·
1 Parent(s): 2bcb67d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -8
app.py CHANGED
@@ -25,17 +25,23 @@ font_prop = FontProperties(fname=font_path)
25
  def jieba_tokenizer(text):
26
  return jieba.lcut(text)
27
 
28
- # Initialize KeyBERT model
29
  vectorizer = CountVectorizer(tokenizer=jieba_tokenizer)
30
  kw_model = KeyBERT()
 
31
 
32
  # Extract keywords using MMR
33
  def extract_keywords(doc, diversity):
34
  keywords = kw_model.extract_keywords(doc, vectorizer=vectorizer, use_mmr=True, diversity=diversity)
35
  return keywords
36
 
 
 
 
 
 
37
  # Plot keywords
38
- def plot_keywords(keywords, title):
39
  words = [kw[0] for kw in keywords]
40
  scores = [kw[1] for kw in keywords]
41
  plt.figure(figsize=(10, 6))
@@ -45,8 +51,8 @@ def plot_keywords(keywords, title):
45
  plt.gca().invert_yaxis()
46
  plt.xticks(fontproperties=font_prop)
47
  plt.yticks(fontproperties=font_prop)
48
- plt.savefig('/tmp/keywords_plot.png')
49
- return '/tmp/keywords_plot.png'
50
 
51
  # Generate word cloud
52
  def generate_word_cloud(text):
@@ -67,23 +73,28 @@ def scrape_and_extract(url, diversity):
67
  content_div = soup.find('div', {'class': 'caas-body'})
68
  paragraphs = content_div.find_all('p')
69
  content = '\n'.join([p.text.strip() for p in paragraphs])
 
70
  keywords = extract_keywords(content, diversity)
71
- keyword_plot_path = plot_keywords(keywords, "Keyword Extraction Results")
 
 
 
72
  wordcloud_path = generate_word_cloud(content)
73
- return title, content, keywords, keyword_plot_path, wordcloud_path
 
74
 
75
  # Streamlit Interface
76
  st.set_page_config(page_title="Professional Keyword Extraction Tool", page_icon="🔍")
77
 
78
  st.title("🔍 Professional Keyword Extraction Tool")
79
- st.write("Extracts keywords from a given URL and displays a bar chart of the keywords with their respective scores. Additionally, a word cloud is generated based on TF-IDF scores.")
80
 
81
  url = st.text_input("🌐 Enter the article URL here:")
82
  diversity = st.slider("Adjust Diversity (0.0: Most Relevant, 1.0: Most Diverse)", 0.0, 1.0, 0.5, step=0.01)
83
 
84
  if st.button("Extract Keywords"):
85
  if url:
86
- title, content, keywords, keyword_plot_path, wordcloud_path = scrape_and_extract(url, diversity)
87
 
88
  st.subheader("📄 Article Title")
89
  st.write(title)
@@ -98,6 +109,13 @@ if st.button("Extract Keywords"):
98
  st.subheader("📊 Keywords Bar Chart")
99
  st.image(keyword_plot_path)
100
 
 
 
 
 
 
 
 
101
  st.subheader("☁️ Word Cloud")
102
  st.image(wordcloud_path)
103
  else:
 
25
  def jieba_tokenizer(text):
26
  return jieba.lcut(text)
27
 
28
+ # Initialize KeyBERT models
29
  vectorizer = CountVectorizer(tokenizer=jieba_tokenizer)
30
  kw_model = KeyBERT()
31
+ kw_model_multilingual = KeyBERT(model='distiluse-base-multilingual-cased-v1')
32
 
33
  # Extract keywords using MMR
34
  def extract_keywords(doc, diversity):
35
  keywords = kw_model.extract_keywords(doc, vectorizer=vectorizer, use_mmr=True, diversity=diversity)
36
  return keywords
37
 
38
+ # Extract multilingual keywords
39
+ def extract_multilingual_keywords(doc, diversity):
40
+ keywords = kw_model_multilingual.extract_keywords(doc, vectorizer=vectorizer, use_mmr=True, diversity=diversity)
41
+ return keywords
42
+
43
  # Plot keywords
44
+ def plot_keywords(keywords, title, filename):
45
  words = [kw[0] for kw in keywords]
46
  scores = [kw[1] for kw in keywords]
47
  plt.figure(figsize=(10, 6))
 
51
  plt.gca().invert_yaxis()
52
  plt.xticks(fontproperties=font_prop)
53
  plt.yticks(fontproperties=font_prop)
54
+ plt.savefig(f'/tmp/{filename}.png')
55
+ return f'/tmp/{filename}.png'
56
 
57
  # Generate word cloud
58
  def generate_word_cloud(text):
 
73
  content_div = soup.find('div', {'class': 'caas-body'})
74
  paragraphs = content_div.find_all('p')
75
  content = '\n'.join([p.text.strip() for p in paragraphs])
76
+
77
  keywords = extract_keywords(content, diversity)
78
+ keywords_multilingual = extract_multilingual_keywords(content, diversity)
79
+
80
+ keyword_plot_path = plot_keywords(keywords, "Keyword Extraction Results", "keywords_plot")
81
+ keyword_plot_multilingual_path = plot_keywords(keywords_multilingual, "Multilingual Keyword Extraction Results", "keywords_multilingual_plot")
82
  wordcloud_path = generate_word_cloud(content)
83
+
84
+ return title, content, keywords, keyword_plot_path, keywords_multilingual, keyword_plot_multilingual_path, wordcloud_path
85
 
86
  # Streamlit Interface
87
  st.set_page_config(page_title="Professional Keyword Extraction Tool", page_icon="🔍")
88
 
89
  st.title("🔍 Professional Keyword Extraction Tool")
90
+ st.write("Extracts keywords from a given URL and displays two bar charts of the keywords with their respective scores. Additionally, a word cloud is generated based on TF-IDF scores.")
91
 
92
  url = st.text_input("🌐 Enter the article URL here:")
93
  diversity = st.slider("Adjust Diversity (0.0: Most Relevant, 1.0: Most Diverse)", 0.0, 1.0, 0.5, step=0.01)
94
 
95
  if st.button("Extract Keywords"):
96
  if url:
97
+ title, content, keywords, keyword_plot_path, keywords_multilingual, keyword_plot_multilingual_path, wordcloud_path = scrape_and_extract(url, diversity)
98
 
99
  st.subheader("📄 Article Title")
100
  st.write(title)
 
109
  st.subheader("📊 Keywords Bar Chart")
110
  st.image(keyword_plot_path)
111
 
112
+ st.subheader("🔑 Multilingual Extracted Keywords")
113
+ keywords_multilingual_str = '\n'.join([f"{kw[0]}: {kw[1]:.4f}" for kw in keywords_multilingual])
114
+ st.text(keywords_multilingual_str)
115
+
116
+ st.subheader("📊 Multilingual Keywords Bar Chart")
117
+ st.image(keyword_plot_multilingual_path)
118
+
119
  st.subheader("☁️ Word Cloud")
120
  st.image(wordcloud_path)
121
  else: