Spaces:
Sleeping
Sleeping
| import requests | |
| from bs4 import BeautifulSoup | |
| import pandas as pd | |
| import jieba | |
| from keybert import KeyBERT | |
| from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer | |
| import streamlit as st | |
| import matplotlib.pyplot as plt | |
| from wordcloud import WordCloud | |
| import io | |
| # Tokenizer | |
def jieba_tokenizer(text):
    """Segment *text* into word tokens with jieba.

    ``jieba.lcut`` returns runs of spaces and newlines as tokens of their
    own.  Those whitespace-only tokens are dropped here so they cannot end
    up as "words" in a vectorizer vocabulary built with this tokenizer.

    Args:
        text: The string to segment.

    Returns:
        list: The tokens in their original order, without whitespace-only
        entries.
    """
    return [token for token in jieba.lcut(text) if token.strip()]
# Shared keyword-extraction objects, created when the script runs.

# Bag-of-words vectorizer whose tokens come from jieba_tokenizer.
vectorizer = CountVectorizer(
    tokenizer=jieba_tokenizer,
)

# KeyBERT instance created with its default settings.
kw_model = KeyBERT()

# KeyBERT instance backed by the multilingual DistilUSE model.
kw_model_multilingual = KeyBERT(
    model='distiluse-base-multilingual-cased-v1',
)
| # Extract keywords using MMR | |
def extract_keywords(doc, diversity):
    """Return the keywords the default KeyBERT model finds in *doc*.

    Candidate words come from the module-level ``vectorizer`` and the
    selection uses Maximal Marginal Relevance (``use_mmr=True``).

    Args:
        doc: Text to extract keywords from.
        diversity: MMR diversity value forwarded to KeyBERT.

    Returns:
        Whatever ``kw_model.extract_keywords`` returns for these arguments.
    """
    mmr_options = {'use_mmr': True, 'diversity': diversity}
    return kw_model.extract_keywords(doc, vectorizer=vectorizer, **mmr_options)
| # Extract multilingual keywords | |
def extract_multilingual_keywords(doc, diversity):
    """Return the keywords the multilingual KeyBERT model finds in *doc*.

    Candidate words come from the module-level ``vectorizer`` and the
    selection uses Maximal Marginal Relevance (``use_mmr=True``).

    Args:
        doc: Text to extract keywords from.
        diversity: MMR diversity value forwarded to KeyBERT.

    Returns:
        Whatever ``kw_model_multilingual.extract_keywords`` returns for
        these arguments.
    """
    mmr_options = {'use_mmr': True, 'diversity': diversity}
    return kw_model_multilingual.extract_keywords(
        doc,
        vectorizer=vectorizer,
        **mmr_options,
    )
| # Plot keywords | |
def plot_keywords(keywords, title):
    """Render keyword scores as a horizontal bar chart and return it as PNG.

    Args:
        keywords: Iterable of ``(word, score)`` pairs.
        title: Text shown as the chart title.

    Returns:
        io.BytesIO: Buffer holding the PNG image, rewound to offset 0.
    """
    df = pd.DataFrame(keywords, columns=['Word', 'Score'])
    # Draw on an explicit Figure/Axes pair rather than pyplot's implicit
    # "current figure", so the chart does not depend on process-global state.
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        ax.barh(df['Word'], df['Score'], color='#1f77b4')
        ax.set_xlabel('Score')
        ax.set_title(title)
        # barh puts the first row at the bottom; flip so it is listed first.
        ax.invert_yaxis()
        fig.tight_layout()
        img = io.BytesIO()
        fig.savefig(img, format='png')
    finally:
        # Release the figure even when drawing or saving fails.
        plt.close(fig)
    img.seek(0)
    return img
| # Generate word cloud | |
def generate_word_cloud(text):
    """Build a word cloud from the TF-IDF weights of *text* and return it as PNG.

    The text is treated as a single document, tokenized with
    ``jieba_tokenizer``, and each term's TF-IDF weight is used as its
    frequency in the cloud.

    Args:
        text: The document to visualise.

    Returns:
        io.BytesIO: Buffer holding the PNG image, rewound to offset 0.
    """
    tfidf_vectorizer = TfidfVectorizer(tokenizer=jieba_tokenizer)
    tfidf_matrix = tfidf_vectorizer.fit_transform([text])
    terms = tfidf_vectorizer.get_feature_names_out()
    # One document in, so the matrix has a single row of per-term weights.
    weights = tfidf_matrix.toarray().flatten()
    tfidf_scores = dict(zip(terms, weights))

    # NOTE(review): no font_path is passed to WordCloud; confirm its default
    # font can draw the CJK tokens that jieba produces.
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(tfidf_scores)

    # Draw on an explicit Figure/Axes pair rather than pyplot's implicit
    # "current figure", so the image does not depend on process-global state.
    fig, ax = plt.subplots(figsize=(10, 5))
    try:
        ax.imshow(wordcloud, interpolation='bilinear')
        ax.axis('off')
        fig.tight_layout(pad=0)
        img = io.BytesIO()
        fig.savefig(img, format='png')
    finally:
        # Release the figure even when drawing or saving fails.
        plt.close(fig)
    img.seek(0)
    return img
| # Function to scrape content and extract keywords | |
def scrape_and_extract(url, diversity, timeout=10):
    """Download an article, extract its keywords and render the charts.

    The page is expected to carry its headline in
    ``<h1 id="caas-lead-header-undefined">`` and its text in ``<p>`` elements
    inside ``<div class="caas-body">``.

    Args:
        url: Address of the article page.
        diversity: MMR diversity value passed to both keyword extractors.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        tuple: ``(title, content, keywords, keyword_plot,
        keywords_multilingual, keyword_plot_multilingual, wordcloud)``, where
        the plots and the word cloud are the image buffers returned by
        ``plot_keywords`` and ``generate_word_cloud``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If the headline or body element is missing, or the body
            contains no paragraph text.
    """
    # Without a timeout an unresponsive server would block the app forever.
    response = requests.get(url, timeout=timeout)
    # Fail here on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    title_tag = soup.find('h1', {'id': 'caas-lead-header-undefined'})
    if title_tag is None:
        raise ValueError(
            "Could not find the article title "
            "(<h1 id='caas-lead-header-undefined'>) on this page."
        )
    content_div = soup.find('div', {'class': 'caas-body'})
    if content_div is None:
        raise ValueError(
            "Could not find the article body (<div class='caas-body'>) on this page."
        )

    title = title_tag.text.strip()
    content = '\n'.join(p.text.strip() for p in content_div.find_all('p'))
    if not content.strip():
        raise ValueError("The article body contains no paragraph text.")

    keywords = extract_keywords(content, diversity)
    keywords_multilingual = extract_multilingual_keywords(content, diversity)
    keyword_plot = plot_keywords(keywords, "Keyword Extraction Results")
    keyword_plot_multilingual = plot_keywords(keywords_multilingual, "Multilingual Keyword Extraction Results")
    wordcloud = generate_word_cloud(content)
    return title, content, keywords, keyword_plot, keywords_multilingual, keyword_plot_multilingual, wordcloud
# Streamlit page: collect a URL and a diversity value, then show the results.
# NOTE(review): several labels below start with non-ASCII glyphs that look
# like mis-encoded emoji; confirm against the original file before changing.
st.set_page_config(page_title="Professional Keyword Extraction Tool", page_icon="π")
st.title("π Professional Keyword Extraction Tool")
st.write("Extracts keywords from a given URL and displays two bar charts of the keywords with their respective scores. Additionally, a word cloud is generated based on TF-IDF scores.")

url = st.text_input("π Enter the article URL here:")
diversity = st.slider("Adjust Diversity (0.0: Most Relevant, 1.0: Most Diverse)", 0.0, 1.0, 0.5, step=0.01)

if st.button("Extract Keywords"):
    if not url:
        st.warning("Please enter a URL to extract keywords.")
    else:
        try:
            (
                title,
                content,
                keywords,
                keyword_plot,
                keywords_multilingual,
                keyword_plot_multilingual,
                wordcloud,
            ) = scrape_and_extract(url, diversity)

            st.subheader("π Article Title")
            st.write(title)
            st.subheader("π Article Content")
            st.write(content)

            # Both keyword sections share one layout: a score table followed
            # by the matching bar chart.
            keyword_sections = (
                ("π Extracted Keywords", "π Keywords Bar Chart",
                 keywords, keyword_plot),
                ("π Multilingual Extracted Keywords", "π Multilingual Keywords Bar Chart",
                 keywords_multilingual, keyword_plot_multilingual),
            )
            for table_heading, chart_heading, scored_words, chart_image in keyword_sections:
                st.subheader(table_heading)
                st.dataframe(pd.DataFrame(scored_words, columns=['Word', 'Score']))
                st.subheader(chart_heading)
                st.image(chart_image)

            st.subheader("βοΈ Word Cloud")
            st.image(wordcloud)
        except Exception as e:
            # Script-level boundary: show any failure in the page.
            st.error(f"An error occurred: {str(e)}")