JEPHONETORRE committed on
Commit
79a97f2
·
1 Parent(s): 135831e
Files changed (2) hide show
  1. app.py +56 -0
  2. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Streamlit app: LDA topic modeling over user-pasted news articles.

The user pastes one article per line; the app vectorizes the articles,
fits a Latent Dirichlet Allocation model, lists the strongest terms of each
topic, and plots the topic mixture of the first article.
"""

import streamlit as st
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import matplotlib.pyplot as plt

# Model / display settings, named so they can be tuned in one place.
NUM_TOPICS = 5
TOP_TERMS_PER_TOPIC = 10
MAX_VOCABULARY_SIZE = 5000
LDA_RANDOM_SEED = 42


def _split_articles(raw_text):
    """Return the non-blank lines of *raw_text*, stripped (one per article)."""
    return [line.strip() for line in raw_text.split("\n") if line.strip()]


def _top_terms(topic_weights, feature_names, count=TOP_TERMS_PER_TOPIC):
    """Return the *count* highest-weighted terms of a topic, strongest first."""
    # argsort() is ascending, so reverse it before slicing to lead with the
    # most representative term instead of the weakest of the top group.
    strongest_first = topic_weights.argsort()[::-1][:count]
    return [feature_names[i] for i in strongest_first]


# Title and tabs
st.title("Topic Modeling for News Articles")
tab1, tab2 = st.tabs(["LDA Topic Modeling", "About the Dataset"])

# First Tab: Topic Modeling
with tab1:
    st.header("Input Data")

    # Text input for articles
    st.write("Paste your news articles (one article per line):")
    user_input = st.text_area("Enter articles here", height=200)

    if st.button("Analyze Topics"):
        articles = _split_articles(user_input)

        if not articles:
            st.warning("Please input some articles to analyze.")
        else:
            # NOTE(review): LDA is formulated for term counts, and the
            # scikit-learn LDA examples feed it CountVectorizer output.
            # TF-IDF is kept here to preserve existing results; consider
            # switching to raw counts.
            vectorizer = TfidfVectorizer(
                stop_words='english',
                max_features=MAX_VOCABULARY_SIZE,
            )
            try:
                tfidf_matrix = vectorizer.fit_transform(articles)
            except ValueError:
                # Raised when no usable term survives tokenization and
                # stop-word removal (empty vocabulary). Report it in the UI
                # instead of letting a traceback replace the page; st.stop()
                # is avoided so the second tab still renders.
                st.error(
                    "No usable words were found in the input. "
                    "Please provide longer or more varied articles."
                )
            else:
                # LDA Topic Modeling: fit and get the per-article topic
                # mixture in a single pass.
                lda = LatentDirichletAllocation(
                    n_components=NUM_TOPICS,
                    random_state=LDA_RANDOM_SEED,
                )
                topic_distribution = lda.fit_transform(tfidf_matrix)

                # Display topics
                st.subheader("Identified Topics")
                feature_names = vectorizer.get_feature_names_out()
                for idx, topic in enumerate(lda.components_):
                    st.write(
                        f"**Topic {idx + 1}:**",
                        ", ".join(_top_terms(topic, feature_names)),
                    )

                # Visualize topic distribution
                st.subheader("Topic Distribution")
                # Label bars 1..N so the chart matches the topic list above.
                topic_labels = [
                    f"Topic {number}"
                    for number in range(1, len(lda.components_) + 1)
                ]
                # Use an explicit figure rather than global pyplot state, and
                # close it after rendering so reruns do not accumulate
                # open figures.
                fig, ax = plt.subplots(figsize=(10, 5))
                ax.bar(topic_labels, topic_distribution[0])
                ax.set_xlabel("Topics")
                ax.set_ylabel("Contribution")
                ax.set_title("Topic Distribution for the First Article")
                st.pyplot(fig)
                plt.close(fig)

# Second Tab: About the Dataset
with tab2:
    st.header("About")
    st.write("This app performs topic modeling on news articles using Latent Dirichlet Allocation (LDA).")
    st.write("Paste articles in the text area, and the app will identify underlying topics.")
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ pandas==1.5.3
2
+ numpy>=1.24.3
3
+ scikit-learn>=1.3.0
4
+ matplotlib>=3.7.1
5
+ streamlit>=1.25.0