# crawler_NLP / app.py — uploaded by JERNGOC ("Update app.py", commit bba69fc, verified)
import requests
from bs4 import BeautifulSoup
import pandas as pd
import jieba
from keybert import KeyBERT
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import streamlit as st
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import io
# Tokenizer
def jieba_tokenizer(text):
    """Segment text into a list of tokens with jieba (Chinese-aware word splitting)."""
    return list(jieba.cut(text))
# Module-level singletons shared by all requests.
# Default English KeyBERT model plus a multilingual sentence-transformer variant;
# the CountVectorizer tokenizes candidates with jieba so Chinese word
# boundaries are respected.
kw_model = KeyBERT()
kw_model_multilingual = KeyBERT(model='distiluse-base-multilingual-cased-v1')
vectorizer = CountVectorizer(tokenizer=jieba_tokenizer)
# Extract keywords using MMR
def extract_keywords(doc, diversity):
    """Return (keyword, score) pairs for `doc` from the default KeyBERT model.

    MMR (Maximal Marginal Relevance) re-ranking is enabled; `diversity` in
    [0, 1] trades pure relevance (0.0) against result diversity (1.0).
    """
    return kw_model.extract_keywords(
        doc,
        vectorizer=vectorizer,
        use_mmr=True,
        diversity=diversity,
    )
# Extract multilingual keywords
def extract_multilingual_keywords(doc, diversity):
    """Return (keyword, score) pairs for `doc` from the multilingual KeyBERT model.

    Same MMR configuration as extract_keywords, but scored with the
    distiluse-base-multilingual-cased-v1 embedding model.
    """
    return kw_model_multilingual.extract_keywords(
        doc,
        vectorizer=vectorizer,
        use_mmr=True,
        diversity=diversity,
    )
# Plot keywords
def plot_keywords(keywords, title):
    """Render a horizontal bar chart of (word, score) pairs to an in-memory PNG.

    Uses the object-oriented Matplotlib API (explicit Figure/Axes) instead of
    the pyplot state machine: Streamlit reruns scripts in worker threads, and
    mutating pyplot's shared "current figure" can race between sessions.

    Args:
        keywords: iterable of (word, score) tuples, as returned by KeyBERT.
        title: chart title string.

    Returns:
        io.BytesIO positioned at offset 0, containing the PNG image.
    """
    df = pd.DataFrame(keywords, columns=['Word', 'Score'])
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh(df['Word'], df['Score'], color='#1f77b4')
    ax.set_xlabel('Score')
    ax.set_title(title)
    ax.invert_yaxis()  # highest-ranked keyword at the top
    fig.tight_layout()
    img = io.BytesIO()
    fig.savefig(img, format='png')
    img.seek(0)
    plt.close(fig)  # close this exact figure; avoids leaking figures across reruns
    return img
# Generate word cloud
def generate_word_cloud(text):
    """Build a word cloud PNG for `text`, weighted by TF-IDF scores.

    The vectorizer is fitted on the single document, so IDF is constant and
    the weights effectively reduce to (normalized) term frequency.

    Args:
        text: the article body as one string.

    Returns:
        io.BytesIO positioned at offset 0, containing the PNG image.
    """
    tfidf_vectorizer = TfidfVectorizer(tokenizer=jieba_tokenizer)
    tfidf_matrix = tfidf_vectorizer.fit_transform([text])
    tfidf_scores = dict(zip(tfidf_vectorizer.get_feature_names_out(),
                            tfidf_matrix.toarray().flatten()))
    # NOTE(review): WordCloud's bundled default font has no CJK glyphs; if the
    # output shows empty boxes for Chinese terms, pass font_path= to a
    # CJK-capable font file — confirm on a real article.
    wordcloud = WordCloud(width=800, height=400,
                          background_color='white').generate_from_frequencies(tfidf_scores)
    # Object-oriented Matplotlib API: avoids the shared pyplot "current figure",
    # which is unsafe under Streamlit's threaded reruns.
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    fig.tight_layout(pad=0)
    img = io.BytesIO()
    fig.savefig(img, format='png')
    img.seek(0)
    plt.close(fig)
    return img
# Function to scrape content and extract keywords
def scrape_and_extract(url, diversity):
    """Fetch an article page and run both keyword pipelines plus the word cloud.

    Args:
        url: article URL. The CSS selectors below target Yahoo News markup
            ('caas-lead-header-undefined' / 'caas-body'); other sites will
            not match and raise ValueError.
        diversity: MMR diversity in [0, 1], forwarded to KeyBERT.

    Returns:
        (title, content, keywords, keyword_plot, keywords_multilingual,
         keyword_plot_multilingual, wordcloud)

    Raises:
        requests.HTTPError: on a 4xx/5xx response.
        requests.Timeout: if the host does not answer within the timeout.
        ValueError: if the expected title/body elements are missing.
    """
    # Timeout keeps a dead host from hanging the Streamlit worker forever;
    # raise_for_status stops us from parsing an HTTP error page as an article.
    response = requests.get(url, timeout=15)
    response.raise_for_status()
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    title_tag = soup.find('h1', {'id': 'caas-lead-header-undefined'})
    content_div = soup.find('div', {'class': 'caas-body'})
    # Fail with a clear message instead of an opaque AttributeError on None.
    if title_tag is None or content_div is None:
        raise ValueError(
            "Could not locate the article title/body on this page - "
            "is this a Yahoo News article URL?"
        )
    title = title_tag.text.strip()
    content = '\n'.join(p.text.strip() for p in content_div.find_all('p'))
    keywords = extract_keywords(content, diversity)
    keywords_multilingual = extract_multilingual_keywords(content, diversity)
    keyword_plot = plot_keywords(keywords, "Keyword Extraction Results")
    keyword_plot_multilingual = plot_keywords(keywords_multilingual, "Multilingual Keyword Extraction Results")
    wordcloud = generate_word_cloud(content)
    return title, content, keywords, keyword_plot, keywords_multilingual, keyword_plot_multilingual, wordcloud
# Streamlit Interface
# Fix: the original emoji literals were mojibake (UTF-8 bytes decoded through a
# wrong codepage, e.g. "πŸ”" for the magnifier); restored to the intended emoji.
st.set_page_config(page_title="Professional Keyword Extraction Tool", page_icon="🔍")
st.title("🔍 Professional Keyword Extraction Tool")
st.write("Extracts keywords from a given URL and displays two bar charts of the keywords with their respective scores. Additionally, a word cloud is generated based on TF-IDF scores.")
# Inputs: article URL and the MMR diversity knob forwarded to KeyBERT.
url = st.text_input("🌐 Enter the article URL here:")
diversity = st.slider("Adjust Diversity (0.0: Most Relevant, 1.0: Most Diverse)", 0.0, 1.0, 0.5, step=0.01)
if st.button("Extract Keywords"):
    if url:
        try:
            # Single call does everything: fetch, parse, extract, plot.
            title, content, keywords, keyword_plot, keywords_multilingual, keyword_plot_multilingual, wordcloud = scrape_and_extract(url, diversity)
            st.subheader("📄 Article Title")
            st.write(title)
            st.subheader("📝 Article Content")
            st.write(content)
            st.subheader("🔑 Extracted Keywords")
            df_keywords = pd.DataFrame(keywords, columns=['Word', 'Score'])
            st.dataframe(df_keywords)
            st.subheader("📊 Keywords Bar Chart")
            st.image(keyword_plot)
            st.subheader("🔑 Multilingual Extracted Keywords")
            df_keywords_multilingual = pd.DataFrame(keywords_multilingual, columns=['Word', 'Score'])
            st.dataframe(df_keywords_multilingual)
            st.subheader("📊 Multilingual Keywords Bar Chart")
            st.image(keyword_plot_multilingual)
            st.subheader("☁️ Word Cloud")
            st.image(wordcloud)
        except Exception as e:
            # Broad catch at the UI boundary: surface the failure (network,
            # parse, or model error) in the page instead of crashing the app.
            st.error(f"An error occurred: {str(e)}")
    else:
        st.warning("Please enter a URL to extract keywords.")