Spaces:

JERNGOC
/

crawler_NLP

Sleeping

App Files Community

JERNGOC committed on Aug 5, 2024

Commit

c60d67e

verified ·

1 Parent(s): 109edb2

Update app.py

Browse files

Files changed (1) hide show

app.py +37 -70

app.py CHANGED Viewed

@@ -1,14 +1,14 @@
 import requests
 from bs4 import BeautifulSoup
-import pandas as pd
 import jieba
 from keybert import KeyBERT
 from sklearn.feature_extraction.text import CountVectorizer
-import streamlit as st
 import matplotlib.pyplot as plt
 from matplotlib.font_manager import FontProperties
-# Function to download the font
 def download_font(url, save_path):
     response = requests.get(url)
     with open(save_path, 'wb') as f:
@@ -17,106 +17,73 @@ def download_font(url, save_path):
 # Font URL and path
 font_url = 'https://drive.google.com/uc?id=1eGAsTN1HBpJAkeVM57_C7ccp7hbgSz3_&export=download'
 font_path = 'TaipeiSansTCBeta-Regular.ttf'
-# Download and set the font
 download_font(font_url, font_path)
 font_prop = FontProperties(fname=font_path)
-# Function to tokenize text using Jieba
 def jieba_tokenizer(text):
     return jieba.lcut(text)
-# Initialize CountVectorizer and KeyBERT model
 vectorizer = CountVectorizer(tokenizer=jieba_tokenizer)
 kw_model = KeyBERT()
-# Function to extract keywords
 def extract_keywords(doc):
     keywords = kw_model.extract_keywords(doc, vectorizer=vectorizer)
     return keywords
-# Function to plot keywords
 def plot_keywords(keywords, title):
     words = [kw[0] for kw in keywords]
     scores = [kw[1] for kw in keywords]
     plt.figure(figsize=(10, 6))
-    plt.barh(words, scores, color='skyblue')
-    plt.xlabel('分數', fontproperties=font_prop)
     plt.title(title, fontproperties=font_prop)
     plt.gca().invert_yaxis()
     plt.xticks(fontproperties=font_prop)
     plt.yticks(fontproperties=font_prop)
-    st.pyplot(plt)
-# Function to scrape the article
-def scrape_article(url):
     response = requests.get(url)
     response.encoding = 'utf-8'
     soup = BeautifulSoup(response.text, 'html.parser')
-    title = soup.find('h1', {'data-test-locator': 'headline'}).text
-    content_paragraphs = soup.select('#caas-art-4a83c85b-31af-331e-9628-8bce28f03bf1 article div div div div div div.caas-content-wrapper div.caas-body p')
-    content = '\n'.join([p.text for p in content_paragraphs])
-    return title, content
-# Streamlit app
-st.set_page_config(page_title="中文關鍵詞提取工具", page_icon="🔍", layout="wide")
-st.title("中文關鍵詞提取工具 🔍")
-st.markdown("""
-<style>
-    .main {
-        background-color: #f5f5f5;
-        padding: 20px;
-    }
-    .stButton>button {
-        background-color: #4CAF50;
-        color: white;
-        border: none;
-        padding: 15px 32px;
-        text-align: center;
-        text-decoration: none;
-        display: inline-block;
-        font-size: 16px;
-        margin: 4px 2px;
-        cursor: pointer;
-    }
-</style>
-""", unsafe_allow_html=True)
-st.markdown("### 請輸入Yahoo新聞文章的URL：")
-url = st.text_input("")
-if st.button("抓取文章並提取關鍵詞 🚀"):
     if url:
-        with st.spinner("正在抓取文章內容..."):
-            title, content = scrape_article(url)
-            st.success("文章抓取成功！")
-        st.markdown("### 文章標題：")
-        st.write(f"**{title}**")
-        st.markdown("### 文章內容：")
         st.write(content)
-        with st.spinner("正在提取關鍵詞..."):
-            keywords = extract_keywords(content)
-            st.success("關鍵詞提取成功！")
-        st.markdown("### 關鍵詞提取結果：")
-        keywords_df = pd.DataFrame(keywords, columns=["關鍵詞", "分數"])
-        st.dataframe(keywords_df.style.background_gradient(cmap='Blues'))
-        plot_keywords(keywords, "關鍵詞提取結果")
-        with st.spinner("使用多語言模型提取關鍵詞..."):
-            kw_model_multilingual = KeyBERT(model='distiluse-base-multilingual-cased-v1')
-            keywords_multilingual = kw_model_multilingual.extract_keywords(content, vectorizer=vectorizer)
-            st.success("多語言模型關鍵詞提取成功！")
-        st.markdown("### 多語言模型關鍵詞提取結果：")
-        keywords_multilingual_df = pd.DataFrame(keywords_multilingual, columns=["關鍵詞", "分數"])
-        st.dataframe(keywords_multilingual_df.style.background_gradient(cmap='Greens'))
-        plot_keywords(keywords_multilingual, "多語言模型關鍵詞提取結果")
     else:
-        st.error("請輸入文章URL以進行關鍵詞提取。")

 import requests
 from bs4 import BeautifulSoup
 import jieba
 from keybert import KeyBERT
 from sklearn.feature_extraction.text import CountVectorizer
 import matplotlib.pyplot as plt
 from matplotlib.font_manager import FontProperties
+import streamlit as st
+import pandas as pd
+# Download font
 def download_font(url, save_path):
     response = requests.get(url)
     with open(save_path, 'wb') as f:
 # Font URL and path
 font_url = 'https://drive.google.com/uc?id=1eGAsTN1HBpJAkeVM57_C7ccp7hbgSz3_&export=download'
 font_path = 'TaipeiSansTCBeta-Regular.ttf'
 download_font(font_url, font_path)
 font_prop = FontProperties(fname=font_path)
+# Tokenizer
 def jieba_tokenizer(text):
     return jieba.lcut(text)
+# Initialize KeyBERT model
 vectorizer = CountVectorizer(tokenizer=jieba_tokenizer)
 kw_model = KeyBERT()
+# Extract keywords
 def extract_keywords(doc):
     keywords = kw_model.extract_keywords(doc, vectorizer=vectorizer)
     return keywords
+# Plot keywords
 def plot_keywords(keywords, title):
     words = [kw[0] for kw in keywords]
     scores = [kw[1] for kw in keywords]
     plt.figure(figsize=(10, 6))
+    plt.barh(words, scores, color='#1f77b4')
+    plt.xlabel('Score', fontproperties=font_prop)
     plt.title(title, fontproperties=font_prop)
     plt.gca().invert_yaxis()
     plt.xticks(fontproperties=font_prop)
     plt.yticks(fontproperties=font_prop)
+    plt.savefig('/tmp/keywords_plot.png')
+    return '/tmp/keywords_plot.png'
+# Function to scrape content and extract keywords
+def scrape_and_extract(url):
     response = requests.get(url)
     response.encoding = 'utf-8'
     soup = BeautifulSoup(response.text, 'html.parser')
+    title = soup.find('h1', {'id': 'caas-lead-header-undefined'}).text.strip()
+    content_div = soup.find('div', {'class': 'caas-body'})
+    paragraphs = content_div.find_all('p')
+    content = '\n'.join([p.text.strip() for p in paragraphs])
+    keywords = extract_keywords(content)
+    plot_path = plot_keywords(keywords, "Keyword Extraction Results")
+    return title, content, keywords, plot_path
+# Streamlit Interface
+st.set_page_config(page_title="Professional Keyword Extraction Tool", page_icon="🔍")
+st.title("🔍 Professional Keyword Extraction Tool")
+st.write("Extracts keywords from a given URL and displays a bar chart of the keywords with their respective scores.")
+url = st.text_input("🌐 Enter the article URL here:")
+if st.button("Extract Keywords"):
     if url:
+        title, content, keywords, plot_path = scrape_and_extract(url)
+        st.subheader("📄 Article Title")
+        st.write(title)
+        st.subheader("📝 Article Content")
         st.write(content)
+        st.subheader("🔑 Extracted Keywords")
+        keywords_str = '\n'.join([f"{kw[0]}: {kw[1]:.4f}" for kw in keywords])
+        st.text(keywords_str)
+        st.subheader("📊 Keywords Bar Chart")
+        st.image(plot_path)
     else:
+        st.warning("Please enter a URL to extract keywords.")