Spaces:

Manikanta3776
/

GENAI

Build error

App Files Files Community

Manikanta3776 committed on Mar 16, 2025

Commit

ff1df12

verified ·

1 Parent(s): b7c0fe8

Create app.py

Browse files

Files changed (1) hide show

app.py +94 -0

app.py ADDED Viewed

	@@ -0,0 +1,94 @@

+import streamlit as st
+import pandas as pd
+import re
+import nltk
+from nltk.corpus import stopwords
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.decomposition import LatentDirichletAllocation
+import matplotlib.pyplot as plt
+from wordcloud import WordCloud
+import seaborn as sns
+# Download stopwords
+nltk.download('stopwords')
+stop_words = set(stopwords.words('english'))
+def preprocess_text(text):
+    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I)
+    text = text.lower()
+    tokens = text.split()
+    tokens = [word for word in tokens if word not in stop_words]
+    return ' '.join(tokens)
+def perform_lda(text_data, num_topics=5):
+    vectorizer = CountVectorizer(stop_words='english')
+    dtm = vectorizer.fit_transform(text_data)
+    lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
+    lda.fit(dtm)
+    return lda, vectorizer, dtm
+def plot_wordcloud(term_dict):
+    wordcloud = WordCloud(width=800, height=400, background_color="white").generate_from_frequencies(term_dict)
+    fig, ax = plt.subplots(figsize=(10, 5))
+    ax.imshow(wordcloud, interpolation='bilinear')
+    ax.axis('off')
+    st.pyplot(fig)
+def plot_topic_proportions(proportions, num_topics):
+    fig, ax = plt.subplots(figsize=(10, 6))
+    ax.bar(range(num_topics), proportions, color='skyblue')
+    ax.set_title("Proportions of Different Topics in Text Data")
+    ax.set_xlabel("Topic")
+    ax.set_ylabel("Proportion")
+    ax.set_xticks(range(num_topics))
+    ax.set_xticklabels([f"Topic {i+1}" for i in range(num_topics)])
+    st.pyplot(fig)
+def print_topics(lda, vectorizer, num_words=10):
+    terms = vectorizer.get_feature_names_out()
+    topics = []
+    for index, topic in enumerate(lda.components_):
+        top_terms_idx = topic.argsort()[-num_words:][::-1]
+        top_terms = [terms[i] for i in top_terms_idx]
+        topics.append(f"Topic #{index + 1}: {', '.join(top_terms)}")
+    return topics
+# Streamlit UI
+st.title("Text Analysis and Topic Modeling")
+st.write("Upload a CSV file containing a column with text data.")
+uploaded_file = st.file_uploader("Upload CSV", type=["csv"])
+if uploaded_file:
+    data = pd.read_csv(uploaded_file)
+    text_column = st.selectbox("Select the text column", data.columns)
+    data['text_clean'] = data[text_column].apply(preprocess_text)
+    st.write("Sample Processed Text:")
+    st.write(data[['text_clean']].head())
+    # Extract key terms
+    vectorizer = CountVectorizer(max_features=50, stop_words='english')
+    X = vectorizer.fit_transform(data['text_clean'])
+    terms = vectorizer.get_feature_names_out()
+    term_frequencies = X.sum(axis=0).A1
+    term_dict = dict(zip(terms, term_frequencies))
+    st.subheader("Word Cloud of Key Terms")
+    plot_wordcloud(term_dict)
+    # Perform LDA
+    num_topics = st.slider("Select number of topics", min_value=2, max_value=10, value=5)
+    lda, vectorizer_lda, dtm = perform_lda(data['text_clean'], num_topics)
+    # Display topics
+    st.subheader("Identified Topics")
+    topics = print_topics(lda, vectorizer_lda)
+    for topic in topics:
+        st.write(topic)
+    # Topic proportions
+    topic_proportions = lda.transform(dtm)
+    avg_topic_proportions = topic_proportions.mean(axis=0)
+    st.subheader("Topic Proportions")
+    plot_topic_proportions(avg_topic_proportions, lda.components_.shape[0])