Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -2,11 +2,12 @@ import requests
|
|
| 2 |
from bs4 import BeautifulSoup
|
| 3 |
import jieba
|
| 4 |
from keybert import KeyBERT
|
| 5 |
-
from sklearn.feature_extraction.text import CountVectorizer
|
| 6 |
import matplotlib.pyplot as plt
|
| 7 |
from matplotlib.font_manager import FontProperties
|
| 8 |
import streamlit as st
|
| 9 |
import pandas as pd
|
|
|
|
| 10 |
|
| 11 |
# Download font
|
| 12 |
def download_font(url, save_path):
|
|
@@ -47,6 +48,16 @@ def plot_keywords(keywords, title):
|
|
| 47 |
plt.savefig('/tmp/keywords_plot.png')
|
| 48 |
return '/tmp/keywords_plot.png'
|
| 49 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
# Function to scrape content and extract keywords
|
| 51 |
def scrape_and_extract(url, diversity):
|
| 52 |
response = requests.get(url)
|
|
@@ -57,21 +68,22 @@ def scrape_and_extract(url, diversity):
|
|
| 57 |
paragraphs = content_div.find_all('p')
|
| 58 |
content = '\n'.join([p.text.strip() for p in paragraphs])
|
| 59 |
keywords = extract_keywords(content, diversity)
|
| 60 |
-
|
| 61 |
-
|
|
|
|
| 62 |
|
| 63 |
# Streamlit Interface
|
| 64 |
st.set_page_config(page_title="Professional Keyword Extraction Tool", page_icon="π")
|
| 65 |
|
| 66 |
st.title("π Professional Keyword Extraction Tool")
|
| 67 |
-
st.write("Extracts keywords from a given URL and displays a bar chart of the keywords with their respective scores.")
|
| 68 |
|
| 69 |
url = st.text_input("π Enter the article URL here:")
|
| 70 |
diversity = st.slider("Adjust Diversity (0.0: Most Relevant, 1.0: Most Diverse)", 0.0, 1.0, 0.5, step=0.01)
|
| 71 |
|
| 72 |
if st.button("Extract Keywords"):
|
| 73 |
if url:
|
| 74 |
-
title, content, keywords,
|
| 75 |
|
| 76 |
st.subheader("π Article Title")
|
| 77 |
st.write(title)
|
|
@@ -84,7 +96,9 @@ if st.button("Extract Keywords"):
|
|
| 84 |
st.text(keywords_str)
|
| 85 |
|
| 86 |
st.subheader("π Keywords Bar Chart")
|
| 87 |
-
st.image(
|
|
|
|
|
|
|
|
|
|
| 88 |
else:
|
| 89 |
st.warning("Please enter a URL to extract keywords.")
|
| 90 |
-
|
|
|
|
| 2 |
from bs4 import BeautifulSoup
|
| 3 |
import jieba
|
| 4 |
from keybert import KeyBERT
|
| 5 |
+
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
|
| 6 |
import matplotlib.pyplot as plt
|
| 7 |
from matplotlib.font_manager import FontProperties
|
| 8 |
import streamlit as st
|
| 9 |
import pandas as pd
|
| 10 |
+
from wordcloud import WordCloud
|
| 11 |
|
| 12 |
# Download font
|
| 13 |
def download_font(url, save_path):
|
|
|
|
| 48 |
plt.savefig('/tmp/keywords_plot.png')
|
| 49 |
return '/tmp/keywords_plot.png'
|
| 50 |
|
| 51 |
+
# Generate word cloud
|
| 52 |
+
def generate_word_cloud(text):
|
| 53 |
+
tfidf_vectorizer = TfidfVectorizer(tokenizer=jieba_tokenizer)
|
| 54 |
+
tfidf_matrix = tfidf_vectorizer.fit_transform([text])
|
| 55 |
+
tfidf_scores = dict(zip(tfidf_vectorizer.get_feature_names_out(), tfidf_matrix.toarray().flatten()))
|
| 56 |
+
|
| 57 |
+
wordcloud = WordCloud(font_path=font_path, background_color='white').generate_from_frequencies(tfidf_scores)
|
| 58 |
+
wordcloud.to_file('/tmp/wordcloud.png')
|
| 59 |
+
return '/tmp/wordcloud.png'
|
| 60 |
+
|
| 61 |
# Function to scrape content and extract keywords
|
| 62 |
def scrape_and_extract(url, diversity):
|
| 63 |
response = requests.get(url)
|
|
|
|
| 68 |
paragraphs = content_div.find_all('p')
|
| 69 |
content = '\n'.join([p.text.strip() for p in paragraphs])
|
| 70 |
keywords = extract_keywords(content, diversity)
|
| 71 |
+
keyword_plot_path = plot_keywords(keywords, "Keyword Extraction Results")
|
| 72 |
+
wordcloud_path = generate_word_cloud(content)
|
| 73 |
+
return title, content, keywords, keyword_plot_path, wordcloud_path
|
| 74 |
|
| 75 |
# Streamlit Interface
|
| 76 |
st.set_page_config(page_title="Professional Keyword Extraction Tool", page_icon="π")
|
| 77 |
|
| 78 |
st.title("π Professional Keyword Extraction Tool")
|
| 79 |
+
st.write("Extracts keywords from a given URL and displays a bar chart of the keywords with their respective scores. Additionally, a word cloud is generated based on TF-IDF scores.")
|
| 80 |
|
| 81 |
url = st.text_input("π Enter the article URL here:")
|
| 82 |
diversity = st.slider("Adjust Diversity (0.0: Most Relevant, 1.0: Most Diverse)", 0.0, 1.0, 0.5, step=0.01)
|
| 83 |
|
| 84 |
if st.button("Extract Keywords"):
|
| 85 |
if url:
|
| 86 |
+
title, content, keywords, keyword_plot_path, wordcloud_path = scrape_and_extract(url, diversity)
|
| 87 |
|
| 88 |
st.subheader("π Article Title")
|
| 89 |
st.write(title)
|
|
|
|
| 96 |
st.text(keywords_str)
|
| 97 |
|
| 98 |
st.subheader("π Keywords Bar Chart")
|
| 99 |
+
st.image(keyword_plot_path)
|
| 100 |
+
|
| 101 |
+
st.subheader("☁️ Word Cloud")
|
| 102 |
+
st.image(wordcloud_path)
|
| 103 |
else:
|
| 104 |
st.warning("Please enter a URL to extract keywords.")
|
|
|