"""Streamlit keyword-extraction tool.

Scrapes an article from a Yahoo-News-style page ("caas" markup), extracts
keywords with KeyBERT (default and multilingual models) using MMR for
diversity control, and renders two bar charts plus a TF-IDF-weighted
word cloud.
"""

import io

import jieba
import matplotlib.pyplot as plt
import pandas as pd
import requests
import streamlit as st
from bs4 import BeautifulSoup
from keybert import KeyBERT
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from wordcloud import WordCloud


# Tokenizer
def jieba_tokenizer(text):
    """Tokenize (Chinese or mixed) text into a list of words with jieba."""
    return jieba.lcut(text)


# Shared vectorizer so KeyBERT candidates are jieba-segmented words.
vectorizer = CountVectorizer(tokenizer=jieba_tokenizer)


@st.cache_resource(show_spinner=False)
def _load_models():
    """Load both KeyBERT models once per server process.

    Streamlit re-executes this whole script on every widget interaction;
    without caching, the heavy sentence-transformer models would be
    re-instantiated on each rerun.
    """
    return KeyBERT(), KeyBERT(model='distiluse-base-multilingual-cased-v1')


kw_model, kw_model_multilingual = _load_models()


# Extract keywords using MMR
def extract_keywords(doc, diversity):
    """Extract keywords from *doc* with the default KeyBERT model.

    Uses Maximal Marginal Relevance; *diversity* in [0, 1] trades
    relevance (0.0) against diversity (1.0).

    Returns a list of (word, score) tuples.
    """
    keywords = kw_model.extract_keywords(
        doc, vectorizer=vectorizer, use_mmr=True, diversity=diversity
    )
    return keywords


# Extract multilingual keywords
def extract_multilingual_keywords(doc, diversity):
    """Extract keywords from *doc* with the multilingual KeyBERT model.

    Same contract as :func:`extract_keywords`, but backed by
    ``distiluse-base-multilingual-cased-v1``.
    """
    keywords = kw_model_multilingual.extract_keywords(
        doc, vectorizer=vectorizer, use_mmr=True, diversity=diversity
    )
    return keywords


# Plot keywords
def plot_keywords(keywords, title):
    """Render a horizontal bar chart of (word, score) pairs.

    Returns an in-memory PNG (``io.BytesIO``) suitable for ``st.image``.
    """
    df = pd.DataFrame(keywords, columns=['Word', 'Score'])
    plt.figure(figsize=(10, 6))
    plt.barh(df['Word'], df['Score'], color='#1f77b4')
    plt.xlabel('Score')
    plt.title(title)
    # Highest-scoring keyword at the top of the chart.
    plt.gca().invert_yaxis()
    plt.tight_layout()
    img = io.BytesIO()
    plt.savefig(img, format='png')
    img.seek(0)
    plt.close()
    return img


# Generate word cloud
def generate_word_cloud(text):
    """Build a word cloud weighted by single-document TF-IDF scores.

    Returns an in-memory PNG (``io.BytesIO``) suitable for ``st.image``.
    """
    tfidf_vectorizer = TfidfVectorizer(tokenizer=jieba_tokenizer)
    tfidf_matrix = tfidf_vectorizer.fit_transform([text])
    tfidf_scores = dict(
        zip(tfidf_vectorizer.get_feature_names_out(),
            tfidf_matrix.toarray().flatten())
    )
    # NOTE(review): WordCloud's default font has no CJK glyphs; if articles
    # are Chinese, pass font_path= to render correctly — confirm with users.
    wordcloud = WordCloud(
        width=800, height=400, background_color='white'
    ).generate_from_frequencies(tfidf_scores)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.tight_layout(pad=0)
    img = io.BytesIO()
    plt.savefig(img, format='png')
    img.seek(0)
    plt.close()
    return img


# Function to scrape content and extract keywords
def scrape_and_extract(url, diversity):
    """Download the article at *url* and run the full extraction pipeline.

    Returns a 7-tuple: (title, content, keywords, keyword_plot,
    keywords_multilingual, keyword_plot_multilingual, wordcloud).

    Raises ``requests.RequestException`` on network/HTTP errors and
    ``ValueError`` when the expected article markup is missing.
    """
    # Timeout so the UI cannot hang forever on an unresponsive host;
    # raise_for_status so we don't try to parse an HTTP error page.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    # Selectors target Yahoo News "caas" article markup specifically.
    title_tag = soup.find('h1', {'id': 'caas-lead-header-undefined'})
    if title_tag is None:
        raise ValueError('Could not find an article title on this page.')
    title = title_tag.text.strip()
    content_div = soup.find('div', {'class': 'caas-body'})
    if content_div is None:
        raise ValueError('Could not find the article body on this page.')
    paragraphs = content_div.find_all('p')
    content = '\n'.join(p.text.strip() for p in paragraphs)
    keywords = extract_keywords(content, diversity)
    keywords_multilingual = extract_multilingual_keywords(content, diversity)
    keyword_plot = plot_keywords(keywords, "Keyword Extraction Results")
    keyword_plot_multilingual = plot_keywords(
        keywords_multilingual, "Multilingual Keyword Extraction Results"
    )
    wordcloud = generate_word_cloud(content)
    return (title, content, keywords, keyword_plot,
            keywords_multilingual, keyword_plot_multilingual, wordcloud)


# Streamlit Interface
st.set_page_config(page_title="Professional Keyword Extraction Tool", page_icon="🔍")
st.title("🔍 Professional Keyword Extraction Tool")
st.write("Extracts keywords from a given URL and displays two bar charts of the keywords with their respective scores. Additionally, a word cloud is generated based on TF-IDF scores.")

url = st.text_input("🌐 Enter the article URL here:")
diversity = st.slider("Adjust Diversity (0.0: Most Relevant, 1.0: Most Diverse)", 0.0, 1.0, 0.5, step=0.01)

if st.button("Extract Keywords"):
    if url:
        try:
            (title, content, keywords, keyword_plot,
             keywords_multilingual, keyword_plot_multilingual,
             wordcloud) = scrape_and_extract(url, diversity)
            st.subheader("📄 Article Title")
            st.write(title)
            st.subheader("📝 Article Content")
            st.write(content)
            st.subheader("🔑 Extracted Keywords")
            df_keywords = pd.DataFrame(keywords, columns=['Word', 'Score'])
            st.dataframe(df_keywords)
            st.subheader("📊 Keywords Bar Chart")
            st.image(keyword_plot)
            st.subheader("🔑 Multilingual Extracted Keywords")
            df_keywords_multilingual = pd.DataFrame(
                keywords_multilingual, columns=['Word', 'Score']
            )
            st.dataframe(df_keywords_multilingual)
            st.subheader("📊 Multilingual Keywords Bar Chart")
            st.image(keyword_plot_multilingual)
            st.subheader("☁️ Word Cloud")
            st.image(wordcloud)
        except Exception as e:
            # Top-level UI boundary: surface any failure to the user.
            st.error(f"An error occurred: {str(e)}")
    else:
        st.warning("Please enter a URL to extract keywords.")