Spaces:

JERNGOC
/

crawler_NLP

Sleeping

App Files Files Community

JERNGOC committed on Aug 5, 2024

Commit

b49045a

verified ·

1 Parent(s): 06ecdab

Update app.py

Browse files

Files changed (1) hide show

app.py +21 -7

app.py CHANGED Viewed

@@ -2,11 +2,12 @@ import requests
 from bs4 import BeautifulSoup
 import jieba
 from keybert import KeyBERT
-from sklearn.feature_extraction.text import CountVectorizer
 import matplotlib.pyplot as plt
 from matplotlib.font_manager import FontProperties
 import streamlit as st
 import pandas as pd
 # Download font
 def download_font(url, save_path):
@@ -47,6 +48,16 @@ def plot_keywords(keywords, title):
     plt.savefig('/tmp/keywords_plot.png')
     return '/tmp/keywords_plot.png'
 # Function to scrape content and extract keywords
 def scrape_and_extract(url, diversity):
     response = requests.get(url)
@@ -57,21 +68,22 @@ def scrape_and_extract(url, diversity):
     paragraphs = content_div.find_all('p')
     content = '\n'.join([p.text.strip() for p in paragraphs])
     keywords = extract_keywords(content, diversity)
-    plot_path = plot_keywords(keywords, "Keyword Extraction Results")
-    return title, content, keywords, plot_path
 # Streamlit Interface
 st.set_page_config(page_title="Professional Keyword Extraction Tool", page_icon="🔍")
 st.title("🔍 Professional Keyword Extraction Tool")
-st.write("Extracts keywords from a given URL and displays a bar chart of the keywords with their respective scores.")
 url = st.text_input("🌐 Enter the article URL here:")
 diversity = st.slider("Adjust Diversity (0.0: Most Relevant, 1.0: Most Diverse)", 0.0, 1.0, 0.5, step=0.01)
 if st.button("Extract Keywords"):
     if url:
-        title, content, keywords, plot_path = scrape_and_extract(url, diversity)
         st.subheader("📄 Article Title")
         st.write(title)
@@ -84,7 +96,9 @@ if st.button("Extract Keywords"):
         st.text(keywords_str)
         st.subheader("📊 Keywords Bar Chart")
-        st.image(plot_path)
     else:
         st.warning("Please enter a URL to extract keywords.")

 from bs4 import BeautifulSoup
 import jieba
 from keybert import KeyBERT
+from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
 import matplotlib.pyplot as plt
 from matplotlib.font_manager import FontProperties
 import streamlit as st
 import pandas as pd
+from wordcloud import WordCloud
 # Download font
 def download_font(url, save_path):
     plt.savefig('/tmp/keywords_plot.png')
     return '/tmp/keywords_plot.png'
+# Generate word cloud
+def generate_word_cloud(text):
+    tfidf_vectorizer = TfidfVectorizer(tokenizer=jieba_tokenizer)
+    tfidf_matrix = tfidf_vectorizer.fit_transform([text])
+    tfidf_scores = dict(zip(tfidf_vectorizer.get_feature_names_out(), tfidf_matrix.toarray().flatten()))
+    wordcloud = WordCloud(font_path=font_path, background_color='white').generate_from_frequencies(tfidf_scores)
+    wordcloud.to_file('/tmp/wordcloud.png')
+    return '/tmp/wordcloud.png'
 # Function to scrape content and extract keywords
 def scrape_and_extract(url, diversity):
     response = requests.get(url)
     paragraphs = content_div.find_all('p')
     content = '\n'.join([p.text.strip() for p in paragraphs])
     keywords = extract_keywords(content, diversity)
+    keyword_plot_path = plot_keywords(keywords, "Keyword Extraction Results")
+    wordcloud_path = generate_word_cloud(content)
+    return title, content, keywords, keyword_plot_path, wordcloud_path
 # Streamlit Interface
 st.set_page_config(page_title="Professional Keyword Extraction Tool", page_icon="🔍")
 st.title("🔍 Professional Keyword Extraction Tool")
+st.write("Extracts keywords from a given URL and displays a bar chart of the keywords with their respective scores. Additionally, a word cloud is generated based on TF-IDF scores.")
 url = st.text_input("🌐 Enter the article URL here:")
 diversity = st.slider("Adjust Diversity (0.0: Most Relevant, 1.0: Most Diverse)", 0.0, 1.0, 0.5, step=0.01)
 if st.button("Extract Keywords"):
     if url:
+        title, content, keywords, keyword_plot_path, wordcloud_path = scrape_and_extract(url, diversity)
         st.subheader("📄 Article Title")
         st.write(title)
         st.text(keywords_str)
         st.subheader("📊 Keywords Bar Chart")
+        st.image(keyword_plot_path)
+        st.subheader("☁️ Word Cloud")
+        st.image(wordcloud_path)
     else:
         st.warning("Please enter a URL to extract keywords.")