JERNGOC committed on
Commit
c60d67e
·
verified ·
1 Parent(s): 109edb2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -70
app.py CHANGED
@@ -1,14 +1,14 @@
1
  import requests
2
  from bs4 import BeautifulSoup
3
- import pandas as pd
4
  import jieba
5
  from keybert import KeyBERT
6
  from sklearn.feature_extraction.text import CountVectorizer
7
- import streamlit as st
8
  import matplotlib.pyplot as plt
9
  from matplotlib.font_manager import FontProperties
 
 
10
 
11
- # Function to download the font
12
  def download_font(url, save_path):
13
  response = requests.get(url)
14
  with open(save_path, 'wb') as f:
@@ -17,106 +17,73 @@ def download_font(url, save_path):
17
  # Font URL and path
18
  font_url = 'https://drive.google.com/uc?id=1eGAsTN1HBpJAkeVM57_C7ccp7hbgSz3_&export=download'
19
  font_path = 'TaipeiSansTCBeta-Regular.ttf'
20
-
21
- # Download and set the font
22
  download_font(font_url, font_path)
23
  font_prop = FontProperties(fname=font_path)
24
 
25
- # Function to tokenize text using Jieba
26
  def jieba_tokenizer(text):
27
  return jieba.lcut(text)
28
 
29
- # Initialize CountVectorizer and KeyBERT model
30
  vectorizer = CountVectorizer(tokenizer=jieba_tokenizer)
31
  kw_model = KeyBERT()
32
 
33
- # Function to extract keywords
34
  def extract_keywords(doc):
35
  keywords = kw_model.extract_keywords(doc, vectorizer=vectorizer)
36
  return keywords
37
 
38
- # Function to plot keywords
39
  def plot_keywords(keywords, title):
40
  words = [kw[0] for kw in keywords]
41
  scores = [kw[1] for kw in keywords]
42
  plt.figure(figsize=(10, 6))
43
- plt.barh(words, scores, color='skyblue')
44
- plt.xlabel('ๅˆ†ๆ•ธ', fontproperties=font_prop)
45
  plt.title(title, fontproperties=font_prop)
46
  plt.gca().invert_yaxis()
47
  plt.xticks(fontproperties=font_prop)
48
  plt.yticks(fontproperties=font_prop)
49
- st.pyplot(plt)
 
50
 
51
- # Function to scrape the article
52
- def scrape_article(url):
53
  response = requests.get(url)
54
  response.encoding = 'utf-8'
55
  soup = BeautifulSoup(response.text, 'html.parser')
56
- title = soup.find('h1', {'data-test-locator': 'headline'}).text
57
- content_paragraphs = soup.select('#caas-art-4a83c85b-31af-331e-9628-8bce28f03bf1 article div div div div div div.caas-content-wrapper div.caas-body p')
58
- content = '\n'.join([p.text for p in content_paragraphs])
59
- return title, content
 
 
 
60
 
61
- # Streamlit app
62
- st.set_page_config(page_title="ไธญๆ–‡้—œ้ต่ฉžๆๅ–ๅทฅๅ…ท", page_icon="๐Ÿ”", layout="wide")
63
 
64
- st.title("ไธญๆ–‡้—œ้ต่ฉžๆๅ–ๅทฅๅ…ท ๐Ÿ”")
65
- st.markdown("""
66
- <style>
67
- .main {
68
- background-color: #f5f5f5;
69
- padding: 20px;
70
- }
71
- .stButton>button {
72
- background-color: #4CAF50;
73
- color: white;
74
- border: none;
75
- padding: 15px 32px;
76
- text-align: center;
77
- text-decoration: none;
78
- display: inline-block;
79
- font-size: 16px;
80
- margin: 4px 2px;
81
- cursor: pointer;
82
- }
83
- </style>
84
- """, unsafe_allow_html=True)
85
 
86
- st.markdown("### ่ซ‹่ผธๅ…ฅYahooๆ–ฐ่žๆ–‡็ซ ็š„URL๏ผš")
87
- url = st.text_input("")
88
 
89
- if st.button("ๆŠ“ๅ–ๆ–‡็ซ ไธฆๆๅ–้—œ้ต่ฉž ๐Ÿš€"):
90
  if url:
91
- with st.spinner("ๆญฃๅœจๆŠ“ๅ–ๆ–‡็ซ ๅ…งๅฎน..."):
92
- title, content = scrape_article(url)
93
- st.success("ๆ–‡็ซ ๆŠ“ๅ–ๆˆๅŠŸ๏ผ")
94
 
95
- st.markdown("### ๆ–‡็ซ ๆจ™้กŒ๏ผš")
96
- st.write(f"**{title}**")
97
 
98
- st.markdown("### ๆ–‡็ซ ๅ…งๅฎน๏ผš")
99
  st.write(content)
100
 
101
- with st.spinner("ๆญฃๅœจๆๅ–้—œ้ต่ฉž..."):
102
- keywords = extract_keywords(content)
103
- st.success("้—œ้ต่ฉžๆๅ–ๆˆๅŠŸ๏ผ")
104
 
105
- st.markdown("### ้—œ้ต่ฉžๆๅ–็ตๆžœ๏ผš")
106
- keywords_df = pd.DataFrame(keywords, columns=["้—œ้ต่ฉž", "ๅˆ†ๆ•ธ"])
107
- st.dataframe(keywords_df.style.background_gradient(cmap='Blues'))
108
-
109
- plot_keywords(keywords, "้—œ้ต่ฉžๆๅ–็ตๆžœ")
110
-
111
- with st.spinner("ไฝฟ็”จๅคš่ชž่จ€ๆจกๅž‹ๆๅ–้—œ้ต่ฉž..."):
112
- kw_model_multilingual = KeyBERT(model='distiluse-base-multilingual-cased-v1')
113
- keywords_multilingual = kw_model_multilingual.extract_keywords(content, vectorizer=vectorizer)
114
- st.success("ๅคš่ชž่จ€ๆจกๅž‹้—œ้ต่ฉžๆๅ–ๆˆๅŠŸ๏ผ")
115
-
116
- st.markdown("### ๅคš่ชž่จ€ๆจกๅž‹้—œ้ต่ฉžๆๅ–็ตๆžœ๏ผš")
117
- keywords_multilingual_df = pd.DataFrame(keywords_multilingual, columns=["้—œ้ต่ฉž", "ๅˆ†ๆ•ธ"])
118
- st.dataframe(keywords_multilingual_df.style.background_gradient(cmap='Greens'))
119
-
120
- plot_keywords(keywords_multilingual, "ๅคš่ชž่จ€ๆจกๅž‹้—œ้ต่ฉžๆๅ–็ตๆžœ")
121
  else:
122
- st.error("่ซ‹่ผธๅ…ฅๆ–‡็ซ URLไปฅ้€ฒ่กŒ้—œ้ต่ฉžๆๅ–ใ€‚")
 
 
1
  import requests
2
  from bs4 import BeautifulSoup
 
3
  import jieba
4
  from keybert import KeyBERT
5
  from sklearn.feature_extraction.text import CountVectorizer
 
6
  import matplotlib.pyplot as plt
7
  from matplotlib.font_manager import FontProperties
8
+ import streamlit as st
9
+ import pandas as pd
10
 
11
+ # Download font
12
  def download_font(url, save_path):
13
  response = requests.get(url)
14
  with open(save_path, 'wb') as f:
 
17
  # Font URL and path
18
  font_url = 'https://drive.google.com/uc?id=1eGAsTN1HBpJAkeVM57_C7ccp7hbgSz3_&export=download'
19
  font_path = 'TaipeiSansTCBeta-Regular.ttf'
 
 
20
  download_font(font_url, font_path)
21
  font_prop = FontProperties(fname=font_path)
22
 
23
+ # Tokenizer
24
  def jieba_tokenizer(text):
25
  return jieba.lcut(text)
26
 
27
+ # Initialize KeyBERT model
28
  vectorizer = CountVectorizer(tokenizer=jieba_tokenizer)
29
  kw_model = KeyBERT()
30
 
31
+ # Extract keywords
32
  def extract_keywords(doc):
33
  keywords = kw_model.extract_keywords(doc, vectorizer=vectorizer)
34
  return keywords
35
 
36
+ # Plot keywords
37
  def plot_keywords(keywords, title):
38
  words = [kw[0] for kw in keywords]
39
  scores = [kw[1] for kw in keywords]
40
  plt.figure(figsize=(10, 6))
41
+ plt.barh(words, scores, color='#1f77b4')
42
+ plt.xlabel('Score', fontproperties=font_prop)
43
  plt.title(title, fontproperties=font_prop)
44
  plt.gca().invert_yaxis()
45
  plt.xticks(fontproperties=font_prop)
46
  plt.yticks(fontproperties=font_prop)
47
+ plt.savefig('/tmp/keywords_plot.png')
48
+ return '/tmp/keywords_plot.png'
49
 
50
+ # Function to scrape content and extract keywords
51
+ def scrape_and_extract(url):
52
  response = requests.get(url)
53
  response.encoding = 'utf-8'
54
  soup = BeautifulSoup(response.text, 'html.parser')
55
+ title = soup.find('h1', {'id': 'caas-lead-header-undefined'}).text.strip()
56
+ content_div = soup.find('div', {'class': 'caas-body'})
57
+ paragraphs = content_div.find_all('p')
58
+ content = '\n'.join([p.text.strip() for p in paragraphs])
59
+ keywords = extract_keywords(content)
60
+ plot_path = plot_keywords(keywords, "Keyword Extraction Results")
61
+ return title, content, keywords, plot_path
62
 
63
+ # Streamlit Interface
64
+ st.set_page_config(page_title="Professional Keyword Extraction Tool", page_icon="๐Ÿ”")
65
 
66
+ st.title("๐Ÿ” Professional Keyword Extraction Tool")
67
+ st.write("Extracts keywords from a given URL and displays a bar chart of the keywords with their respective scores.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
 
69
+ url = st.text_input("๐ŸŒ Enter the article URL here:")
 
70
 
71
+ if st.button("Extract Keywords"):
72
  if url:
73
+ title, content, keywords, plot_path = scrape_and_extract(url)
 
 
74
 
75
+ st.subheader("๐Ÿ“„ Article Title")
76
+ st.write(title)
77
 
78
+ st.subheader("๐Ÿ“ Article Content")
79
  st.write(content)
80
 
81
+ st.subheader("๐Ÿ”‘ Extracted Keywords")
82
+ keywords_str = '\n'.join([f"{kw[0]}: {kw[1]:.4f}" for kw in keywords])
83
+ st.text(keywords_str)
84
 
85
+ st.subheader("๐Ÿ“Š Keywords Bar Chart")
86
+ st.image(plot_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  else:
88
+ st.warning("Please enter a URL to extract keywords.")
89
+