Manikanta3776 committed on
Commit
ff1df12
·
verified ·
1 Parent(s): b7c0fe8

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +94 -0
app.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import re
4
+ import nltk
5
+ from nltk.corpus import stopwords
6
+ from sklearn.feature_extraction.text import CountVectorizer
7
+ from sklearn.decomposition import LatentDirichletAllocation
8
+ import matplotlib.pyplot as plt
9
+ from wordcloud import WordCloud
10
+ import seaborn as sns
11
+
12
# Download the NLTK English stopword list once at startup.
# ``quiet=True`` keeps nltk's download chatter out of the app console.
nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('english'))
15
+
16
def preprocess_text(text, custom_stops=None):
    """Lowercase *text*, strip non-alphabetic characters, and drop stopwords.

    Args:
        text: Raw input string.
        custom_stops: Optional collection of stopwords to remove. Defaults
            to the module-level NLTK English stopword set.

    Returns:
        The cleaned, space-joined string of remaining tokens.
    """
    # BUG FIX: the original called ``re.sub(pattern, '', text, re.I)``,
    # passing ``re.I`` (== 2) as the positional *count* argument, so only
    # the first two unwanted characters were ever removed.  The character
    # class already matches both cases, so no flag is needed at all.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    stops = custom_stops if custom_stops is not None else stop_words
    tokens = [word for word in text.split() if word not in stops]
    return ' '.join(tokens)
22
+
23
def perform_lda(text_data, num_topics=5):
    """Fit an LDA topic model over a corpus of preprocessed documents.

    Args:
        text_data: Iterable of document strings.
        num_topics: Number of latent topics to extract.

    Returns:
        Tuple of (fitted LDA model, fitted CountVectorizer,
        document-term matrix).
    """
    vectorizer = CountVectorizer(stop_words='english')
    document_term_matrix = vectorizer.fit_transform(text_data)
    # random_state pinned so topic assignments are reproducible across runs.
    model = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    model.fit(document_term_matrix)
    return model, vectorizer, document_term_matrix
29
+
30
def plot_wordcloud(term_dict):
    """Render a word cloud of {term: frequency} pairs into the Streamlit page."""
    cloud = WordCloud(
        width=800, height=400, background_color="white"
    ).generate_from_frequencies(term_dict)
    figure, axes = plt.subplots(figsize=(10, 5))
    axes.imshow(cloud, interpolation='bilinear')
    axes.axis('off')
    st.pyplot(figure)
36
+
37
def plot_topic_proportions(proportions, num_topics):
    """Bar-chart the average weight of each topic across the corpus.

    Args:
        proportions: Sequence of per-topic proportions, length *num_topics*.
        num_topics: Number of topics (x-axis positions / labels).
    """
    topic_positions = range(num_topics)
    figure, axes = plt.subplots(figsize=(10, 6))
    axes.bar(topic_positions, proportions, color='skyblue')
    axes.set_title("Proportions of Different Topics in Text Data")
    axes.set_xlabel("Topic")
    axes.set_ylabel("Proportion")
    axes.set_xticks(topic_positions)
    axes.set_xticklabels([f"Topic {i+1}" for i in topic_positions])
    st.pyplot(figure)
46
+
47
def print_topics(lda, vectorizer, num_words=10):
    """Summarise each LDA topic as a string of its top-weighted terms.

    Args:
        lda: Fitted LatentDirichletAllocation model (has ``components_``).
        vectorizer: The vectorizer used to build the model's input
            (provides ``get_feature_names_out``).
        num_words: How many leading terms to list per topic.

    Returns:
        List of strings, one per topic, e.g. "Topic #1: cat, dog".
    """
    vocabulary = vectorizer.get_feature_names_out()
    summaries = []
    for topic_number, weights in enumerate(lda.components_, start=1):
        # argsort is ascending, so take the tail and reverse for descending.
        ranked_indices = weights.argsort()[-num_words:][::-1]
        leading_terms = ', '.join(vocabulary[i] for i in ranked_indices)
        summaries.append(f"Topic #{topic_number}: {leading_terms}")
    return summaries
55
+
56
# Streamlit UI
st.title("Text Analysis and Topic Modeling")
st.write("Upload a CSV file containing a column with text data.")

uploaded_file = st.file_uploader("Upload CSV", type=["csv"])

if uploaded_file:
    data = pd.read_csv(uploaded_file)
    text_column = st.selectbox("Select the text column", data.columns)
    # Robustness fix: NaN or numeric cells would crash preprocess_text
    # (re.sub expects a str), so blank out missing values and coerce to str.
    data['text_clean'] = data[text_column].fillna('').astype(str).apply(preprocess_text)

    st.write("Sample Processed Text:")
    st.write(data[['text_clean']].head())

    # Extract the 50 most frequent terms for the word cloud.
    vectorizer = CountVectorizer(max_features=50, stop_words='english')
    X = vectorizer.fit_transform(data['text_clean'])
    terms = vectorizer.get_feature_names_out()
    term_frequencies = X.sum(axis=0).A1  # flatten the 1 x n_terms count matrix
    term_dict = dict(zip(terms, term_frequencies))

    st.subheader("Word Cloud of Key Terms")
    plot_wordcloud(term_dict)

    # Perform LDA with a user-selected topic count.
    num_topics = st.slider("Select number of topics", min_value=2, max_value=10, value=5)
    lda, vectorizer_lda, dtm = perform_lda(data['text_clean'], num_topics)

    # Display topics
    st.subheader("Identified Topics")
    topics = print_topics(lda, vectorizer_lda)
    for topic in topics:
        st.write(topic)

    # Topic proportions: average each document's topic distribution.
    topic_proportions = lda.transform(dtm)
    avg_topic_proportions = topic_proportions.mean(axis=0)
    st.subheader("Topic Proportions")
    plot_topic_proportions(avg_topic_proportions, lda.components_.shape[0])