Spaces:

JERNGOC
/

crawler_NLP

Sleeping

App Files Files Community

JERNGOC committed on Aug 5, 2024

Commit

bac1a26

verified ·

1 Parent(s): b628e1b

Update app.py

Browse files

Files changed (1) hide show

app.py +7 -8

app.py CHANGED Viewed

@@ -28,9 +28,9 @@ def jieba_tokenizer(text):
 vectorizer = CountVectorizer(tokenizer=jieba_tokenizer)
 kw_model = KeyBERT()
-# Extract keywords
-def extract_keywords(doc):
-    keywords = kw_model.extract_keywords(doc, vectorizer=vectorizer)
     return keywords
 # Plot keywords
@@ -48,7 +48,7 @@ def plot_keywords(keywords, title):
     return '/tmp/keywords_plot.png'
 # Function to scrape content and extract keywords
-def scrape_and_extract(url):
     response = requests.get(url)
     response.encoding = 'utf-8'
     soup = BeautifulSoup(response.text, 'html.parser')
@@ -56,7 +56,7 @@ def scrape_and_extract(url):
     content_div = soup.find('div', {'class': 'caas-body'})
     paragraphs = content_div.find_all('p')
     content = '\n'.join([p.text.strip() for p in paragraphs])
-    keywords = extract_keywords(content)
     plot_path = plot_keywords(keywords, "Keyword Extraction Results")
     return title, content, keywords, plot_path
@@ -67,10 +67,11 @@ st.title("🔍 Professional Keyword Extraction Tool")
 st.write("Extracts keywords from a given URL and displays a bar chart of the keywords with their respective scores.")
 url = st.text_input("🌐 Enter the article URL here:")
 if st.button("Extract Keywords"):
     if url:
-        title, content, keywords, plot_path = scrape_and_extract(url)
         st.subheader("📄 Article Title")
         st.write(title)
@@ -86,5 +87,3 @@ if st.button("Extract Keywords"):
         st.image(plot_path)
     else:
         st.warning("Please enter a URL to extract keywords.")

 vectorizer = CountVectorizer(tokenizer=jieba_tokenizer)
 kw_model = KeyBERT()
+# Extract keywords using MMR
+def extract_keywords(doc, diversity):
+    keywords = kw_model.extract_keywords(doc, vectorizer=vectorizer, use_mmr=True, diversity=diversity)
     return keywords
 # Plot keywords
     return '/tmp/keywords_plot.png'
 # Function to scrape content and extract keywords
+def scrape_and_extract(url, diversity):
     response = requests.get(url)
     response.encoding = 'utf-8'
     soup = BeautifulSoup(response.text, 'html.parser')
     content_div = soup.find('div', {'class': 'caas-body'})
     paragraphs = content_div.find_all('p')
     content = '\n'.join([p.text.strip() for p in paragraphs])
+    keywords = extract_keywords(content, diversity)
     plot_path = plot_keywords(keywords, "Keyword Extraction Results")
     return title, content, keywords, plot_path
 st.write("Extracts keywords from a given URL and displays a bar chart of the keywords with their respective scores.")
 url = st.text_input("🌐 Enter the article URL here:")
+diversity = st.slider("Adjust Diversity (0.0: Most Relevant, 1.0: Most Diverse)", 0.0, 1.0, 0.5, step=0.01)
 if st.button("Extract Keywords"):
     if url:
+        title, content, keywords, plot_path = scrape_and_extract(url, diversity)
         st.subheader("📄 Article Title")
         st.write(title)
         st.image(plot_path)
     else:
         st.warning("Please enter a URL to extract keywords.")