"""Streamlit app that runs LDA topic modeling over pasted news articles.

The user pastes one article per line; the app vectorizes the text with
TF-IDF, fits a Latent Dirichlet Allocation model, lists the top terms of
each topic, and plots the topic mixture of the first article.
"""

import streamlit as st
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import matplotlib.pyplot as plt

# Model / display settings (previously hard-coded inline).
N_TOPICS = 5
TOP_TERMS_PER_TOPIC = 10
MAX_FEATURES = 5000
RANDOM_STATE = 42


def _parse_articles(raw_text):
    """Split the text-area content into stripped, non-empty articles.

    Each line of ``raw_text`` is treated as one article; blank and
    whitespace-only lines are dropped.
    """
    return [line.strip() for line in raw_text.split("\n") if line.strip()]


def _top_terms(topic_weights, feature_names, n_terms=TOP_TERMS_PER_TOPIC):
    """Return the ``n_terms`` highest-weighted terms, strongest first."""
    # argsort is ascending, so reverse before slicing to get the heaviest
    # terms in descending order of weight.
    ranked = topic_weights.argsort()[::-1][:n_terms]
    return [feature_names[i] for i in ranked]


def _render_topic_analysis(articles):
    """Fit LDA on ``articles`` and render topics plus a distribution chart."""
    # NOTE(review): LDA is formulated over term counts and scikit-learn's own
    # example feeds it CountVectorizer output; TF-IDF is kept here so existing
    # results do not change -- confirm whether that is intentional.
    vectorizer = TfidfVectorizer(stop_words='english', max_features=MAX_FEATURES)
    try:
        tfidf_matrix = vectorizer.fit_transform(articles)
    except ValueError:
        # Raised when no term survives tokenization / stop-word removal
        # (e.g. the input is only stop words or punctuation).
        st.error(
            "No usable words were found in the input. "
            "Please enter longer or more varied articles."
        )
        return

    lda = LatentDirichletAllocation(n_components=N_TOPICS, random_state=RANDOM_STATE)
    topic_distribution = lda.fit_transform(tfidf_matrix)

    # Display topics
    st.subheader("Identified Topics")
    feature_names = vectorizer.get_feature_names_out()
    for idx, topic in enumerate(lda.components_):
        st.write(f"**Topic {idx + 1}:**", ", ".join(_top_terms(topic, feature_names)))

    # Visualize topic distribution
    st.subheader("Topic Distribution")
    first_article = topic_distribution[0]
    # Label bars 1..N so the chart matches the "Topic N" headings above.
    labels = [f"Topic {i + 1}" for i in range(len(first_article))]

    # Use an explicit figure instead of global pyplot state, and close it
    # afterwards so figures do not accumulate across Streamlit reruns.
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.bar(labels, first_article)
    ax.set_xlabel("Topics")
    ax.set_ylabel("Contribution")
    ax.set_title("Topic Distribution for the First Article")
    st.pyplot(fig)
    plt.close(fig)


# Title and tabs
st.title("Topic Modeling for News Articles")
tab1, tab2 = st.tabs(["LDA Topic Modeling", "About the Dataset"])

# First Tab: Topic Modeling
with tab1:
    st.header("Input Data")

    # Text input for articles
    st.write("Paste your news articles (one article per line):")
    user_input = st.text_area("Enter articles here", height=200)

    if st.button("Analyze Topics"):
        articles = _parse_articles(user_input)
        if articles:
            _render_topic_analysis(articles)
        else:
            st.warning("Please input some articles to analyze.")

# Second Tab: About the Dataset
with tab2:
    st.header("About")
    st.write("This app performs topic modeling on news articles using Latent Dirichlet Allocation (LDA).")
    st.write("Paste articles in the text area, and the app will identify underlying topics.")