Spaces:
Sleeping
Sleeping
File size: 4,729 Bytes
import requests
from bs4 import BeautifulSoup
import pandas as pd
import jieba
from keybert import KeyBERT
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import streamlit as st
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import io
# Tokenizer
def jieba_tokenizer(text):
    """Segment *text* into a list of tokens with jieba (Chinese-aware word segmentation)."""
    return list(jieba.cut(text))
# Initialize KeyBERT models
# Shared candidate-extraction vectorizer: tokenizes with jieba so KeyBERT can
# propose keyword candidates from Chinese (whitespace-free) text.
vectorizer = CountVectorizer(tokenizer=jieba_tokenizer)
# Default KeyBERT embedding model.
kw_model = KeyBERT()
# Multilingual sentence-transformer backend for non-English documents.
kw_model_multilingual = KeyBERT(model='distiluse-base-multilingual-cased-v1')
# Extract keywords using MMR
def extract_keywords(doc, diversity):
    """Extract keywords from *doc* with the default KeyBERT model.

    Uses Maximal Marginal Relevance; *diversity* ranges from 0.0 (most
    relevant) to 1.0 (most diverse). Returns a list of (word, score) pairs.
    """
    return kw_model.extract_keywords(
        doc,
        vectorizer=vectorizer,
        use_mmr=True,
        diversity=diversity,
    )
# Extract multilingual keywords
def extract_multilingual_keywords(doc, diversity):
    """Extract keywords from *doc* with the multilingual KeyBERT model.

    Same MMR behavior as `extract_keywords`; *diversity* trades relevance
    (0.0) against variety (1.0). Returns a list of (word, score) pairs.
    """
    return kw_model_multilingual.extract_keywords(
        doc,
        vectorizer=vectorizer,
        use_mmr=True,
        diversity=diversity,
    )
# Plot keywords
def plot_keywords(keywords, title):
    """Draw a horizontal bar chart of (word, score) pairs.

    Returns an in-memory PNG (`io.BytesIO`, positioned at 0) suitable for
    `st.image`.
    """
    frame = pd.DataFrame(keywords, columns=['Word', 'Score'])
    plt.figure(figsize=(10, 6))
    plt.barh(frame['Word'], frame['Score'], color='#1f77b4')
    plt.xlabel('Score')
    plt.title(title)
    # Invert so the highest-scoring keyword is drawn at the top.
    plt.gca().invert_yaxis()
    plt.tight_layout()
    buffer = io.BytesIO()
    plt.savefig(buffer, format='png')
    buffer.seek(0)
    plt.close()
    return buffer
# Generate word cloud
def generate_word_cloud(text):
    """Build a word cloud from *text*, weighting words by their TF-IDF score.

    Returns an in-memory PNG (`io.BytesIO`, positioned at 0).
    """
    scorer = TfidfVectorizer(tokenizer=jieba_tokenizer)
    matrix = scorer.fit_transform([text])
    # Map each vocabulary term to its TF-IDF weight for this single document.
    frequencies = dict(zip(scorer.get_feature_names_out(), matrix.toarray().flatten()))
    cloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(frequencies)
    plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.tight_layout(pad=0)
    buffer = io.BytesIO()
    plt.savefig(buffer, format='png')
    buffer.seek(0)
    plt.close()
    return buffer
# Function to scrape content and extract keywords
def scrape_and_extract(url, diversity):
    """Fetch an article page, scrape title/body, and run all keyword pipelines.

    The CSS selectors target the Yahoo News article layout ("caas-*" markup).

    Args:
        url: Article URL to download.
        diversity: MMR diversity passed through to both keyword extractors.

    Returns:
        Tuple of (title, content, keywords, keyword_plot,
        keywords_multilingual, keyword_plot_multilingual, wordcloud).

    Raises:
        requests.RequestException: on network failure or non-2xx HTTP status.
        ValueError: if the expected title/body elements are not in the page.
    """
    # Bound the request so an unresponsive server cannot hang the app forever.
    response = requests.get(url, timeout=15)
    # Fail fast on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    title_tag = soup.find('h1', {'id': 'caas-lead-header-undefined'})
    if title_tag is None:
        # Clear message instead of an AttributeError on NoneType.
        raise ValueError("Could not find the article title on this page; unsupported layout?")
    title = title_tag.text.strip()
    content_div = soup.find('div', {'class': 'caas-body'})
    if content_div is None:
        raise ValueError("Could not find the article body on this page; unsupported layout?")
    paragraphs = content_div.find_all('p')
    content = '\n'.join(p.text.strip() for p in paragraphs)
    keywords = extract_keywords(content, diversity)
    keywords_multilingual = extract_multilingual_keywords(content, diversity)
    keyword_plot = plot_keywords(keywords, "Keyword Extraction Results")
    keyword_plot_multilingual = plot_keywords(keywords_multilingual, "Multilingual Keyword Extraction Results")
    wordcloud = generate_word_cloud(content)
    return title, content, keywords, keyword_plot, keywords_multilingual, keyword_plot_multilingual, wordcloud
# Streamlit Interface
st.set_page_config(page_title="Professional Keyword Extraction Tool", page_icon="π")
st.title("π Professional Keyword Extraction Tool")
st.write("Extracts keywords from a given URL and displays two bar charts of the keywords with their respective scores. Additionally, a word cloud is generated based on TF-IDF scores.")

url = st.text_input("π Enter the article URL here:")
diversity = st.slider("Adjust Diversity (0.0: Most Relevant, 1.0: Most Diverse)", 0.0, 1.0, 0.5, step=0.01)

if st.button("Extract Keywords"):
    # Guard clause: nothing to scrape without a URL.
    if not url:
        st.warning("Please enter a URL to extract keywords.")
    else:
        try:
            (title, content, keywords, keyword_plot,
             keywords_multilingual, keyword_plot_multilingual,
             wordcloud) = scrape_and_extract(url, diversity)
            st.subheader("π Article Title")
            st.write(title)
            st.subheader("π Article Content")
            st.write(content)
            st.subheader("π Extracted Keywords")
            st.dataframe(pd.DataFrame(keywords, columns=['Word', 'Score']))
            st.subheader("π Keywords Bar Chart")
            st.image(keyword_plot)
            st.subheader("π Multilingual Extracted Keywords")
            st.dataframe(pd.DataFrame(keywords_multilingual, columns=['Word', 'Score']))
            st.subheader("π Multilingual Keywords Bar Chart")
            st.image(keyword_plot_multilingual)
            st.subheader("βοΈ Word Cloud")
            st.image(wordcloud)
        except Exception as e:
            # Surface any scraping/model failure to the user instead of crashing.
            st.error(f"An error occurred: {str(e)}")