tjl8 committed on
Commit
656ad3b
·
verified ·
1 Parent(s): 44650fa

Uploading app.py and concept diagram

Browse files
Files changed (2) hide show
  1. Final_Streamlit_App.py +226 -0
  2. concept.jpeg +0 -0
Final_Streamlit_App.py ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
import importlib.util
import subprocess
import sys

# Third-party packages this script installs on demand.
# `collections` and `json` were dropped from the list: both are part of the
# Python standard library, so `pip install` for them either fails or pulls in
# an unrelated PyPI distribution.
# NOTE(review): declaring these in requirements.txt would remove the need for
# any runtime installation — confirm whether the deployment target allows it.
_RUNTIME_PACKAGES = (
    "matplotlib",
    "nltk",
    "wordcloud",
    "numpy",
    "praw",
    "altair",
    "panel",
)

for _package in _RUNTIME_PACKAGES:
    # Streamlit re-executes this script on every interaction; skip packages
    # that are already importable instead of spawning pip each time.
    if importlib.util.find_spec(_package) is not None:
        continue
    # `sys.executable -m pip` installs into the interpreter that is running
    # the app; the argument list avoids going through a shell. As with the
    # previous os.system calls, a failed install is not fatal here — the
    # import further down will surface the problem.
    subprocess.run(
        [sys.executable, "-m", "pip", "install", _package],
        check=False,
    )
12
+
13
+ import streamlit as st
14
+ from streamlit.runtime.state import SessionState
15
+
16
+ st.set_page_config(layout="wide", page_title="AskReddit Data Explorer")
17
+
18
+ import os
19
+ import pandas as pd
20
+ import matplotlib.pyplot as plt
21
+ from nltk.sentiment.vader import SentimentIntensityAnalyzer
22
+ from wordcloud import WordCloud
23
+ import nltk
24
+ import altair as alt
25
+ from datetime import datetime
26
+ import praw
27
+
28
+ st.markdown("""
29
+ <style>
30
+ .reportview-container {
31
+ background: #EFF7FF; /* light bg */
32
+ }
33
+ .main {
34
+ background: #ffffff;
35
+ padding: 20px;
36
+ border-radius: 10px;
37
+ box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
38
+ }
39
+ h1 {
40
+ color: #FF4500; /* orangered */
41
+ text-align: center;
42
+ padding-bottom: 20px;
43
+ border-bottom: 3px solid #FF4500;
44
+ margin-bottom: 30px;
45
+ }
46
+ h2 {
47
+ color: #336699; /* ui text */
48
+ border-bottom: 1px solid #CEE3F8; /* header */
49
+ padding-bottom: 10px;
50
+ }
51
+ .stButton>button {
52
+ background-color: #FF8b60; /* lighter reddit orange */
53
+ color: white;
54
+ border: none;
55
+ border-radius: 4px;
56
+ padding: 8px 16px;
57
+ transition: background-color 0.3s;
58
+ }
59
+ .stButton>button:hover {
60
+ background-color: #FFA07A; /* even lighter shade */
61
+ box-shadow: 0 2px 4px rgba(255, 139, 96, 0.2);
62
+ }
63
+ .stButton>button:active {
64
+ background-color: #FF8b60; /* return to original shade when clicked */
65
+ }
66
+ </style>
67
+ """, unsafe_allow_html=True)
68
+
69
# Streamlit App Title
st.title("AskReddit Dashboard")

# Download NLTK resources (the VADER lexicon backs the sentiment analysis
# used by the word-cloud panel).
nltk.download('vader_lexicon', quiet=True)
nltk.download('stopwords', quiet=True)

# Reddit API credentials are read from the environment rather than being
# embedded in source, so they never end up in version control.
# NOTE(review): the client id/secret pair that was previously committed here
# must be treated as leaked and rotated in the Reddit app settings.
_reddit_client_id = os.environ.get("REDDIT_CLIENT_ID")
_reddit_client_secret = os.environ.get("REDDIT_CLIENT_SECRET")

if not (_reddit_client_id and _reddit_client_secret):
    # Fail with an actionable message instead of an opaque PRAW/HTTP error
    # the first time a request is made.
    st.error(
        "Reddit API credentials are not configured. Set the "
        "REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET environment variables."
    )
    st.stop()

# Initialize the Reddit API client (no username/password is supplied)
reddit = praw.Reddit(
    client_id=_reddit_client_id,
    client_secret=_reddit_client_secret,
    user_agent="DataExplorer"
)
82
+
83
@st.cache_data(ttl=3600)  # Cache for 1 hour
def fetch_askreddit_posts(limit=100):
    """Fetch the current "hot" r/AskReddit posts as a DataFrame.

    Args:
        limit: Maximum number of posts requested from the hot listing.

    Returns:
        pandas.DataFrame with one row per post and the columns ``title``,
        ``score``, ``id``, ``url``, ``num_comments``, ``num_crossposts``,
        ``created_utc``, ``author``, ``upvote_ratio``, ``day_of_week`` and
        ``hour``. The columns are present even when no posts are returned,
        so downstream column lookups do not raise ``KeyError``.

    Time-derived fields (``created_utc``, ``day_of_week``, ``hour``) are
    computed in UTC so they do not depend on the host's local timezone.
    """
    # Local import: the module only imports the `datetime` class itself.
    from datetime import timezone

    columns = [
        'title', 'score', 'id', 'url', 'num_comments', 'num_crossposts',
        'created_utc', 'author', 'upvote_ratio', 'day_of_week', 'hour',
    ]

    subreddit = reddit.subreddit("AskReddit")
    posts = []
    for post in subreddit.hot(limit=limit):
        # Convert once per post. The timestamp is interpreted as UTC and then
        # made naive again so the resulting column keeps the same dtype the
        # rest of the script already works with.
        created = datetime.fromtimestamp(
            post.created_utc, tz=timezone.utc
        ).replace(tzinfo=None)
        posts.append({
            'title': post.title,
            'score': post.score,
            'id': post.id,
            'url': post.url,
            'num_comments': post.num_comments,
            'num_crossposts': post.num_crossposts,
            'created_utc': created,
            # str() turns the author object into plain text for display.
            'author': str(post.author),
            'upvote_ratio': post.upvote_ratio,
            'day_of_week': created.strftime('%A'),
            'hour': created.hour,
        })
    return pd.DataFrame(posts, columns=columns)
102
+
103
def refresh_app():
    """Drop every session-state entry except ``'df'`` and rerun the script."""
    # Collect the keys first so the mapping is not modified while iterating.
    stale_keys = [name for name in st.session_state.keys() if name != 'df']
    for name in stale_keys:
        del st.session_state[name]
    st.rerun()
108
+
109
# Manual reset: clears the stored widget state and reruns the script.
if st.button("Refresh App"):
    refresh_app()

# Load the posts; the call is cached by st.cache_data for one hour.
df = fetch_askreddit_posts()

# Two rows of two columns: a 2x2 grid that hosts the four plots below.
row1_col1, row1_col2 = st.columns(2)
row2_col1, row2_col2 = st.columns(2)
116
+
117
# Plot 1: line chart of how many posts were created in each hour of the
# weekday picked in the dropdown.
with row1_col1:
    st.markdown('<div class="plot-container">', unsafe_allow_html=True)
    st.subheader("Number of Posts vs Time of Day")

    weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    # Seed the widget's state so the dropdown starts on Monday.
    if 'selected_day' not in st.session_state:
        st.session_state['selected_day'] = 'Monday'
    chosen_day = st.selectbox('Select Day of the Week:', weekdays, key='selected_day')

    # Count posts per hour for the chosen weekday only.
    day_posts = df[df['day_of_week'] == chosen_day]
    hourly_counts = day_posts.groupby('hour').size().reset_index(name='count')

    hourly_line = alt.Chart(hourly_counts).mark_line(point=True, color='#FF4500')
    hourly_line = hourly_line.encode(
        x=alt.X('hour:Q', title='Hour of Day'),
        y=alt.Y('count:Q', title='Number of Posts'),
        tooltip=['hour', 'count'],
    )
    hourly_line = hourly_line.properties(title=f'Posts on {chosen_day}', height=300)

    st.altair_chart(hourly_line, use_container_width=True)
    st.markdown('</div>', unsafe_allow_html=True)
135
+
136
# Plot 2: scatter of score against either comment count or crosspost count;
# two buttons choose which metric is on the x-axis.
with row1_col2:
    st.markdown('<div class="plot-container">', unsafe_allow_html=True)
    st.subheader("Scatter Plot of Comments or Crossposts vs Score")

    # The choice lives in session state so it survives the rerun that a
    # button click triggers.
    if 'scatter_plot_type' not in st.session_state:
        st.session_state['scatter_plot_type'] = 'num_comments'

    comments_slot, crossposts_slot = st.columns(2)
    with comments_slot:
        if st.button("Number of Comments"):
            st.session_state['scatter_plot_type'] = 'num_comments'
    with crossposts_slot:
        if st.button("Number of Crossposts"):
            st.session_state['scatter_plot_type'] = 'num_crossposts'

    x_axis = st.session_state['scatter_plot_type']
    # Human-readable label, e.g. 'num_comments' -> 'Num Comments'.
    axis_label = x_axis.replace('_', ' ').title()

    scatter_chart = alt.Chart(df).mark_circle(color='#FF4500', opacity=0.7)
    scatter_chart = scatter_chart.encode(
        x=alt.X(f'{x_axis}:Q', title=axis_label),
        y=alt.Y('score:Q', title='Score'),
        tooltip=['title', x_axis, 'score'],
    )
    scatter_chart = scatter_chart.properties(
        height=300,
        title=f'Correlation between {axis_label} and Score',
    )

    st.altair_chart(scatter_chart, use_container_width=True)
    st.markdown('</div>', unsafe_allow_html=True)
164
+
165
# Plot 3: Word Cloud by Sentiment
# Each title gets a VADER compound score, is bucketed into
# positive/negative/neutral, and the titles of the selected bucket are drawn
# as a word cloud.
with row2_col1:
    st.markdown('<div class="plot-container">', unsafe_allow_html=True)
    st.subheader("Word Cloud by Sentiment")
    analyzer = SentimentIntensityAnalyzer()
    df['sentiment'] = df['title'].apply(lambda x: analyzer.polarity_scores(x)['compound'])
    # Compound scores above 0.05 are labelled positive, below -0.05 negative,
    # and anything in between neutral.
    df['sentiment_category'] = df['sentiment'].apply(
        lambda score: 'positive' if score > 0.05 else 'negative' if score < -0.05 else 'neutral'
    )
    if 'selected_sentiment' not in st.session_state:
        st.session_state.selected_sentiment = 'positive'
    selected_sentiment = st.selectbox("Select Sentiment", ['positive', 'negative', 'neutral'], key='selected_sentiment')
    filtered_titles = " ".join(df[df['sentiment_category'] == selected_sentiment]['title'])

    try:
        # WordCloud raises ValueError when the text contains no usable words,
        # which happens whenever no title falls into the selected bucket.
        wordcloud = WordCloud(
            width=400,
            height=150,
            background_color='white',
            colormap='viridis',
        ).generate(filtered_titles)
    except ValueError:
        st.info(f"No words available for {selected_sentiment} post titles.")
    else:
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.imshow(wordcloud, interpolation='bilinear')
        ax.axis('off')
        st.pyplot(fig)
        # The script reruns on every interaction; close the figure so
        # Matplotlib does not keep accumulating open figures.
        plt.close(fig)
    st.markdown('</div>', unsafe_allow_html=True)
190
+
191
+
192
# Plot 4: bubble chart of score against comment count, with bubble size and
# colour both driven by the post's upvote ratio.
with row2_col2:
    st.markdown('<div class="plot-container">', unsafe_allow_html=True)
    st.subheader("Author Performance Bubble Chart")

    # Upvote ratio drives both encodings; the colour legend is suppressed.
    ratio_size = alt.Size('upvote_ratio:Q', title='Upvote Ratio', scale=alt.Scale(range=[20, 200]))
    ratio_color = alt.Color('upvote_ratio:Q', scale=alt.Scale(scheme='reds'), legend=None)

    bubble_chart = alt.Chart(df).mark_circle()
    bubble_chart = bubble_chart.encode(
        x=alt.X('num_comments:Q', title='Number of Comments'),
        y=alt.Y('score:Q', title='Score'),
        size=ratio_size,
        color=ratio_color,
        tooltip=['author', 'num_comments', 'score', 'upvote_ratio'],
    )
    bubble_chart = bubble_chart.properties(width=600, height=350)

    # Fixed 600x350 size, so the chart is not stretched to the column width.
    st.altair_chart(bubble_chart, use_container_width=False)
    st.markdown('</div>', unsafe_allow_html=True)
209
+
210
# Display raw data: optional dump of the DataFrame, including the sentiment
# columns added by the word-cloud panel above.
if st.checkbox("Show Raw Data"):
    st.write(df)

# Summary: static project description rendered as markdown.
# NOTE(review): the text mentions "radio buttons" for the scatter plot, but
# the panel above uses two st.button widgets — confirm intended wording.
# NOTE(review): the text says words are colour-coded by sentiment and that
# users can pick colour palettes; the word cloud above uses a fixed
# 'viridis' colormap and offers no palette control — confirm.
st.markdown('<div class="plot-container">', unsafe_allow_html=True)
st.markdown("### Project Summary")
st.markdown("This project used dataset from the r/AskReddit subreddit. It includes various fields such as post scores, the number of comments, upvote ratios, and timestamps, providing a comprehensive view of post performance and user engagement. The dataset was explored through several interactive visualizations designed to reveal key insights effectively. A line graph was used to display the number of posts by the hour for each day of the week, with a dropdown menu allowing users to select specific days, making it easy to identify and compare posting patterns such as peak activity hours. A scatter plot illustrated the correlation between post scores and other engagement metrics like the number of comments and crossposts, with interactive radio buttons enabling users to switch the x-axis variable and examine relationships influencing post success. A word cloud incorporating sentiment analysis enriched the exploration by categorizing post titles as positive, negative, or neutral using VADER sentiment analysis. Words were color-coded based on their sentiment, and users could toggle between categories and explore different color palettes, making the visualization both informative and visually appealing. Lastly, a bubble chart provided a unique perspective on individual author performance, with bubbles representing authors, their size reflecting the upvote ratio, and their position showing the number of comments and post scores, offering an engaging way to analyze user contributions.")
st.markdown('</div>', unsafe_allow_html=True)

# Group Members: each name is rendered as its own markdown list item.
st.markdown("### Group Members")
st.markdown("1. Cindy Chung")
st.markdown("2. Irith Chaturvedi")
st.markdown("3. Ning Gao")
st.markdown("4. Pia Schwarzinger")
st.markdown("5. Tanvi Lakhani")
concept.jpeg ADDED