JERNGOC committed on
Commit
b49045a
·
verified ·
1 Parent(s): 06ecdab

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -7
app.py CHANGED
@@ -2,11 +2,12 @@ import requests
2
  from bs4 import BeautifulSoup
3
  import jieba
4
  from keybert import KeyBERT
5
- from sklearn.feature_extraction.text import CountVectorizer
6
  import matplotlib.pyplot as plt
7
  from matplotlib.font_manager import FontProperties
8
  import streamlit as st
9
  import pandas as pd
 
10
 
11
  # Download font
12
  def download_font(url, save_path):
@@ -47,6 +48,16 @@ def plot_keywords(keywords, title):
47
  plt.savefig('/tmp/keywords_plot.png')
48
  return '/tmp/keywords_plot.png'
49
 
 
 
 
 
 
 
 
 
 
 
50
  # Function to scrape content and extract keywords
51
  def scrape_and_extract(url, diversity):
52
  response = requests.get(url)
@@ -57,21 +68,22 @@ def scrape_and_extract(url, diversity):
57
  paragraphs = content_div.find_all('p')
58
  content = '\n'.join([p.text.strip() for p in paragraphs])
59
  keywords = extract_keywords(content, diversity)
60
- plot_path = plot_keywords(keywords, "Keyword Extraction Results")
61
- return title, content, keywords, plot_path
 
62
 
63
  # Streamlit Interface
64
  st.set_page_config(page_title="Professional Keyword Extraction Tool", page_icon="🔍")
65
 
66
  st.title("🔍 Professional Keyword Extraction Tool")
67
- st.write("Extracts keywords from a given URL and displays a bar chart of the keywords with their respective scores.")
68
 
69
  url = st.text_input("🌐 Enter the article URL here:")
70
  diversity = st.slider("Adjust Diversity (0.0: Most Relevant, 1.0: Most Diverse)", 0.0, 1.0, 0.5, step=0.01)
71
 
72
  if st.button("Extract Keywords"):
73
  if url:
74
- title, content, keywords, plot_path = scrape_and_extract(url, diversity)
75
 
76
  st.subheader("📄 Article Title")
77
  st.write(title)
@@ -84,7 +96,9 @@ if st.button("Extract Keywords"):
84
  st.text(keywords_str)
85
 
86
  st.subheader("📊 Keywords Bar Chart")
87
- st.image(plot_path)
 
 
 
88
  else:
89
  st.warning("Please enter a URL to extract keywords.")
90
-
 
2
  from bs4 import BeautifulSoup
3
  import jieba
4
  from keybert import KeyBERT
5
+ from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
6
  import matplotlib.pyplot as plt
7
  from matplotlib.font_manager import FontProperties
8
  import streamlit as st
9
  import pandas as pd
10
+ from wordcloud import WordCloud
11
 
12
  # Download font
13
  def download_font(url, save_path):
 
48
  plt.savefig('/tmp/keywords_plot.png')
49
  return '/tmp/keywords_plot.png'
50
 
51
+ # Generate word cloud
52
+ def generate_word_cloud(text):
53
+ tfidf_vectorizer = TfidfVectorizer(tokenizer=jieba_tokenizer)
54
+ tfidf_matrix = tfidf_vectorizer.fit_transform([text])
55
+ tfidf_scores = dict(zip(tfidf_vectorizer.get_feature_names_out(), tfidf_matrix.toarray().flatten()))
56
+
57
+ wordcloud = WordCloud(font_path=font_path, background_color='white').generate_from_frequencies(tfidf_scores)
58
+ wordcloud.to_file('/tmp/wordcloud.png')
59
+ return '/tmp/wordcloud.png'
60
+
61
  # Function to scrape content and extract keywords
62
  def scrape_and_extract(url, diversity):
63
  response = requests.get(url)
 
68
  paragraphs = content_div.find_all('p')
69
  content = '\n'.join([p.text.strip() for p in paragraphs])
70
  keywords = extract_keywords(content, diversity)
71
+ keyword_plot_path = plot_keywords(keywords, "Keyword Extraction Results")
72
+ wordcloud_path = generate_word_cloud(content)
73
+ return title, content, keywords, keyword_plot_path, wordcloud_path
74
 
75
  # Streamlit Interface
76
  st.set_page_config(page_title="Professional Keyword Extraction Tool", page_icon="🔍")
77
 
78
  st.title("🔍 Professional Keyword Extraction Tool")
79
+ st.write("Extracts keywords from a given URL and displays a bar chart of the keywords with their respective scores. Additionally, a word cloud is generated based on TF-IDF scores.")
80
 
81
  url = st.text_input("🌐 Enter the article URL here:")
82
  diversity = st.slider("Adjust Diversity (0.0: Most Relevant, 1.0: Most Diverse)", 0.0, 1.0, 0.5, step=0.01)
83
 
84
  if st.button("Extract Keywords"):
85
  if url:
86
+ title, content, keywords, keyword_plot_path, wordcloud_path = scrape_and_extract(url, diversity)
87
 
88
  st.subheader("📄 Article Title")
89
  st.write(title)
 
96
  st.text(keywords_str)
97
 
98
  st.subheader("📊 Keywords Bar Chart")
99
+ st.image(keyword_plot_path)
100
+
101
+ st.subheader("☁️ Word Cloud")
102
+ st.image(wordcloud_path)
103
  else:
104
  st.warning("Please enter a URL to extract keywords.")