Rahul-Sainy committed
Commit 422e54a · verified · 1 Parent(s): f5daef7

Upload 5 files

Home.py ADDED
@@ -0,0 +1,395 @@
+ import datetime
+
+ import streamlit as st
+ import io
+ import plotly.express as px
+ from wordcloud import WordCloud
+ import matplotlib.pyplot as plt
+ import plotly.graph_objects as go
+
+ from streamlit_extras.metric_cards import style_metric_cards
+ from streamlit_extras.chart_container import chart_container
+ from streamlit_extras.switch_page_button import switch_page
+ from streamlit_extras.app_logo import add_logo
+
+ from prophet import Prophet
+
+ from channelDataExtraction import getChannelData
+ from channelVideoDataExtraction import *  # provides getVideoList, buildVideoListDataframe, and pd
+
+
+ ########################################################################################################################
+ # FUNCTIONS
+ ########################################################################################################################
+ @st.cache_data
+ def download_data(api_key, channel_id):
+     channel_details = getChannelData(api_key, channel_id)
+
+     # check if bad channel id
+     if channel_details is None:
+         return None, None, None, None
+
+     videos = getVideoList(api_key, channel_details["uploads"])
+     videos_df = pd.DataFrame(videos)
+     video_ids = [video['id'] for video in videos if video['id'] is not None]
+     all_video_data = buildVideoListDataframe(api_key, video_ids)
+
+     st.session_state.start_index = 0
+     st.session_state.end_index = 10
+     st.session_state['video_id'] = None
+     st.session_state.all_video_df = all_video_data
+
+     st.session_state.api_key = st.session_state.API_KEY
+
+     return channel_details, videos, all_video_data, videos_df
+
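+ # Note: st.cache_data memoizes on the (api_key, channel_id) arguments, so
+ # reruns with unchanged inputs are served from the cache rather than
+ # re-querying the YouTube API.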
+
+ def display_video_list(video_data, start_index, end_index, search_query=None):
+     """Displays a list of videos in a tabular format with custom column order and buttons."""
+
+     # Input widget for searching videos by title
+     if search_query is None:
+         search_query = ""
+     new_search_query = st.text_input("Search Videos by Title", search_query)
+
+     # Initialize start_index and end_index in session_state
+     if 'start_index' not in st.session_state:
+         st.session_state.start_index = start_index
+     if 'end_index' not in st.session_state:
+         st.session_state.end_index = end_index
+
+     # If a new search query is entered, reset the start and end indices
+     if new_search_query != search_query:
+         st.session_state.start_index = start_index
+         st.session_state.end_index = end_index
+
+     # Filter videos based on the search query across the entire video_data list
+     filtered_videos = [video for video in video_data if new_search_query.lower() in video['title'].lower()]
+
+     # Paginate the filtered results
+     paginated_videos = filtered_videos[st.session_state.start_index:st.session_state.end_index]
+
+     for video in paginated_videos:
+         col1, col2, col3, col4 = st.columns(4)
+         with col1:
+             st.image(video['thumbnail'])
+         with col2:
+             st.write(video['id'])
+         with col3:
+             st.write(video['title'])
+         with col4:
+             video_stats = st.button("Check Video Statistics", key=video['id'])
+             if video_stats:
+                 st.session_state['video_id'] = video['id']
+                 switch_page("video_data")
+
+     # Display a button to load the next 10 search results
+     if st.session_state.end_index < len(filtered_videos):
+         if st.button('Load next 10 videos', key='load_next'):
+             st.session_state.start_index = st.session_state.end_index
+             st.session_state.end_index += 10
+
+ ########################################################################################################################
+ # MAIN PAGE CONFIGURATION
+ ########################################################################################################################
+ st.set_page_config(page_title="YouTube Channel Analytics Dashboard",
+                    page_icon="📊",
+                    layout="wide")
+
+ ########################################################################################################################
+ # SIDE BAR CONFIGURATION
+ ########################################################################################################################
+ st.title("YouTube Analytics Dashboard")
+
+ # Sidebar
+ st.sidebar.title("Settings")
+
+ # Sidebar: Enter Channel ID and YouTube API Key
+ if 'API_KEY' not in st.session_state:
+     st.session_state.API_KEY = ""
+ if 'CHANNEL_ID' not in st.session_state:
+     st.session_state.CHANNEL_ID = ""
+
+ st.session_state.API_KEY = st.sidebar.text_input("Enter your YouTube API Key", st.session_state.API_KEY,
+                                                  type="password")
+ st.session_state.CHANNEL_ID = st.sidebar.text_input("Enter the YouTube Channel ID", st.session_state.CHANNEL_ID)
+
+ if not st.session_state.API_KEY or not st.session_state.CHANNEL_ID:
+     st.warning("Please enter your API Key and Channel ID.")
+     # Display the GitHub link for the user manual
+     user_manual_link = "https://github.com/zainmz/Youtube-Channel-Analytics-Dashboard"
+     st.markdown(f"If you need help, please refer to the GitHub repository for the [User Manual]({user_manual_link}).")
+     st.stop()
+
+ # Data Refresh Button
+ refresh_button = st.sidebar.button("Refresh Data")
+
+ # First Data Load
+ channel_details, videos, all_video_data, videos_df = download_data(st.session_state.API_KEY, st.session_state.CHANNEL_ID)
+
+ if channel_details is None:
+     st.warning("Invalid YouTube Channel ID. Please check and enter a valid Channel ID.")
+     st.stop()
+
+ if refresh_button:
+     with st.spinner("Refreshing data..."):
+         channel_details, videos, all_video_data, videos_df = download_data(st.session_state.API_KEY, st.session_state.CHANNEL_ID)
+
+     if channel_details is None:
+         st.warning("Invalid YouTube Channel ID. Please check and enter a valid Channel ID.")
+         st.stop()
+
+ # Data Filters for fine-tuned data selection
+ st.sidebar.title("Data Filters")
+
+ num_videos = st.sidebar.slider("Select Number of Top Videos to Display:", 1, 50, 10)
+
+ # Convert the 'published_date' column to datetime format
+ all_video_data['published_date'] = pd.to_datetime(all_video_data['published_date'])
+
+ # Extract min and max publish dates
+ min_date = all_video_data['published_date'].min().date()  # Ensure it's a date object
+ max_date = all_video_data['published_date'].max().date()  # Ensure it's a date object
+
+ # Sidebar date input
+ start_date = st.sidebar.date_input("Select Start Date", min_date)
+ end_date = st.sidebar.date_input("Select End Date", max_date)
+
+ if start_date > end_date:
+     st.sidebar.warning("Start date should be earlier than end date.")
+     st.stop()
+
+ tag_search = st.sidebar.text_input("Search Videos by Tag")
+
+ date_range_start = pd.Timestamp(start_date)
+ date_range_end = pd.Timestamp(end_date)
+
+ filtered_data = all_video_data[(all_video_data['published_date'] >= date_range_start) &
+                                (all_video_data['published_date'] <= date_range_end)]
+
+ if tag_search:
+     filtered_data = filtered_data[filtered_data['tags'].apply(lambda x: tag_search in x)]
+
+ ########################################################################################################################
+ # CHANNEL DETAILS AREA CONFIGURATION
+ ########################################################################################################################
+
+ # Display channel details
+ st.header("Channel Details", divider="green")
+
+ col1, col2, col3 = st.columns(3)
+
+ with col1:
+     channel_thumbnail = channel_details['thumbnail']
+
+     add_logo(channel_thumbnail, height=300)
+
+     view_count = int(channel_details['viewCount'])
+     subscriber_count = int(channel_details['subscriberCount'])
+
+     # Format view count and subscriber count with commas
+     view_count_formatted = "{:,}".format(view_count)
+     subscriber_count_formatted = "{:,}".format(subscriber_count)
+
+     st.markdown(f"**Channel Title:** {channel_details['title']}")
+     st.markdown(f"**Channel Description:** {channel_details['description']}")
+
+ with col3:
+     # Go to Channel Button
+     st.link_button("Go to Channel", f"https://www.youtube.com/channel/{st.session_state.CHANNEL_ID}")
+
+ col1, col2, col3 = st.columns(3)
+ col1.metric("Total Views", view_count_formatted, "")
+ col2.metric("Subscribers", subscriber_count_formatted, "")
+ col3.metric("Total Videos", len(videos), "")
+ style_metric_cards(background_color="#000000",
+                    border_left_color="#049204",
+                    border_color="#0E0E0E"
+                    )
+
+ ########################################################################################################################
+ # TOP VIDEO GRAPHS AREA
+ ########################################################################################################################
+
+ col1, col2, col3 = st.columns(3)
+ # Display statistical graphs for the top videos
+ with col1:
+     st.subheader(f"Top {num_videos} Videos Based on Views")
+     sorted_video_data = filtered_data.sort_values(by='view_count', ascending=False)
+     # Get the top videos from the sorted DataFrame
+     top_views_df = sorted_video_data.head(num_videos)
+     with chart_container(top_views_df):
+         # Create a bar chart of view counts using Plotly
+         fig = px.bar(top_views_df, x='title', y='view_count')
+         # Update the layout to rename the axes
+         fig.update_layout(xaxis_title="Video Title",
+                           yaxis_title="View Count")
+         fig.update_traces(marker_color='green')
+         # Display the bar chart in Streamlit
+         st.plotly_chart(fig, use_container_width=True)
+
+ with col2:
+     st.subheader(f"Top {num_videos} Videos Based on Likes")
+     sorted_video_data = filtered_data.sort_values(by='like_count', ascending=False)
+     # Get the top liked videos from the sorted DataFrame
+     top_likes_df = sorted_video_data.head(num_videos)
+
+     with chart_container(top_likes_df):
+         # Create a bar chart of like counts using Plotly
+         fig = px.bar(top_likes_df, x='title', y='like_count')
+         # Update the layout to rename the axes
+         fig.update_layout(xaxis_title="Video Title",
+                           yaxis_title="Like Count")
+         fig.update_traces(marker_color='orange')
+         # Display the bar chart in Streamlit
+         st.plotly_chart(fig, use_container_width=True)
+
+ with col3:
+     st.subheader(f"Top {num_videos} Videos Based on Comments")
+     sorted_video_data = filtered_data.sort_values(by='comment_count', ascending=False)
+     # Get the most-commented videos from the sorted DataFrame
+     top_comments_df = sorted_video_data.head(num_videos)
+     with chart_container(top_comments_df):
+         # Create a bar chart of comment counts using Plotly
+         fig = px.bar(top_comments_df, x='title', y='comment_count')
+         # Update the layout to rename the axes
+         fig.update_layout(xaxis_title="Video Title",
+                           yaxis_title="Comment Count")
+         fig.update_traces(marker_color='green')
+         # Display the bar chart in Streamlit
+         st.plotly_chart(fig, use_container_width=True)
+
+ ########################################################################################################################
+ # CHANNEL GROWTH STATS
+ ########################################################################################################################
+
+ st.subheader("Viewership Growth Over Time", divider="green")
+ views = filtered_data['view_count']
+ dates = filtered_data['published_date']
+
+ # Creating a time series plot using Plotly
+ fig = go.Figure()
+
+ fig.add_trace(
+     go.Scatter(x=dates, y=views, mode='lines+markers', name='Views Over Time', line=dict(color='orange'))
+ )
+
+ fig.update_layout(title='Views Over Time',
+                   xaxis_title='Published Date',
+                   yaxis_title='Number of Views',
+                   template="plotly_dark")
+
+ st.plotly_chart(fig, use_container_width=True)
+
+ st.subheader("Predicted Viewership Growth Over Time", divider="green")
+
+ with st.spinner("Predicting Views for the next 30 Days"):
+     # Prepare dataframe for Prophet (copy to avoid mutating the cached frame)
+     forecast_df = all_video_data[['published_date', 'view_count']].copy()
+     forecast_df.columns = ['ds', 'y']
+
+     # Initialize the Prophet model
+     model = Prophet(
+         yearly_seasonality=False,
+         weekly_seasonality=True,
+         daily_seasonality=True,
+         seasonality_mode='additive')
+
+     # Fit the model with the data
+     model.fit(forecast_df)
+
+     # Dataframe for future dates
+     future_dates = model.make_future_dataframe(periods=30)
+
+     # Predict views for the future dates
+     forecast = model.predict(future_dates)
+
+     # Filter the forecast dataframe to include only the forecasted period
+     forecasted_period = forecast[forecast['ds'] > forecast_df['ds'].max()]
+
+     # Filter the original dataframe to include only the last 30 days
+     # (window_start avoids shadowing the sidebar's start_date)
+     last_date = forecast_df['ds'].max()
+     window_start = last_date - datetime.timedelta(days=30)
+     last_30_days = forecast_df[(forecast_df['ds'] > window_start) & (forecast_df['ds'] <= last_date)]
+
+     # Plot the original data and the forecast using Plotly
+     trace1 = go.Scatter(x=last_30_days['ds'], y=last_30_days['y'], mode='lines', name='Actual Views (Last 30 Days)')
+     trace2 = go.Scatter(x=forecasted_period['ds'], y=forecasted_period['yhat'], mode='lines',
+                         name='Predicted Views (Next 30 Days)')
+     layout = go.Layout(title="YouTube Views: Last 30 Days and Forecast for Next 30 Days", xaxis_title="Date",
+                        yaxis_title="Views")
+     fig = go.Figure(data=[trace1, trace2], layout=layout)
+
+     # Display the combined historical and forecast data in Streamlit using Plotly
+     st.plotly_chart(fig, use_container_width=True)
+ ########################################################################################################################
+ # WORD CLOUD & LIKE TO VIEW RATIO
+ ########################################################################################################################
+
+ col1, col2 = st.columns(2)
+
+ with col1:
+     st.divider()
+     with st.spinner("Generating Word Cloud..."):
+         st.subheader("Most Common Tags")
+         # Extracting tags from the DataFrame and creating a single string
+         all_tags = " ".join(" ".join(tags) for tags in filtered_data['tags'])
+
+         # Generating the word cloud
+         wordcloud = WordCloud(width=800, height=400, background_color='black').generate(all_tags)
+
+         # Plotting the word cloud using matplotlib
+         plt.figure(figsize=(10, 5))
+         plt.imshow(wordcloud, interpolation='bilinear')
+         plt.axis('off')
+         plt.tight_layout(pad=0)
+
+         # Saving the figure to a bytes buffer
+         buf = io.BytesIO()
+         plt.savefig(buf, format="png", bbox_inches='tight', pad_inches=0)
+         buf.seek(0)
+
+         st.image(buf, use_column_width=True)
+
+ with col2:
+     # Calculating the Like-to-View Ratio
+     filtered_data['like_to_view_ratio'] = filtered_data['like_count'] / filtered_data['view_count']
+
+     # Extracting the like-to-view ratio from the dataframe
+     like_to_view_ratio = filtered_data['like_to_view_ratio']
+
+     st.divider()
+     st.subheader("Like-to-View Ratio Over Time")
+
+     # Creating a time series plot for Like-to-View Ratio using Plotly
+     fig_ratio = go.Figure()
+
+     fig_ratio.add_trace(go.Scatter(x=dates, y=like_to_view_ratio, mode='lines+markers', name='Like-to-View Ratio',
+                                    line=dict(color='green')))
+
+     fig_ratio.update_layout(xaxis_title='Published Date',
+                             yaxis_title='Like-to-View Ratio',
+                             template="plotly_dark")
+
+     # Display the plot in Streamlit
+     st.plotly_chart(fig_ratio, use_container_width=True)
+
+ ########################################################################################################################
+ # DETAILED VIDEO STATS SELECTION SECTION
+ ########################################################################################################################
+
+ st.divider()
+ st.subheader("Select a Video for Detailed Statistics")
+ st.write('Click "Check Video Statistics" to get detailed information about the selected video.')
+ # latest 10 videos
+ display_video_list(videos, 0, 10)
analyze_comments.py ADDED
@@ -0,0 +1,115 @@
+ import pandas as pd
+ import networkx as nx
+ import matplotlib.pyplot as plt
+ import numpy as np
+
+ # NOTE: read at import time; expects the workbook written by getVideoComments()
+ data = pd.read_excel("all_comments.xlsx")
+
+
+ def analyze_comments(data):
+     # Build a fresh directed reply graph
+     G = nx.DiGraph()
+
+     # Add nodes to the graph representing authors
+     for author in data['author'].unique():
+         G.add_node(author)
+
+     # Add edges to the graph representing replies
+     for _, row in data.dropna(subset=['linkage']).iterrows():
+         # Find the author of the main comment (the comment being replied to)
+         main_comment_authors = data[data['comment_id'] == row['linkage']]['author'].values
+         if len(main_comment_authors) > 0:
+             main_comment_author = main_comment_authors[0]
+             G.add_edge(row['author'], main_comment_author)
+
+     # Calculate centrality measures
+     degree_centrality = nx.degree_centrality(G)
+     in_degree_centrality = nx.in_degree_centrality(G)
+     out_degree_centrality = nx.out_degree_centrality(G)
+     betweenness_centrality = nx.betweenness_centrality(G)
+     closeness_centrality = nx.closeness_centrality(G)
+
+     # Create a DataFrame to display the results
+     centrality_df = pd.DataFrame({
+         'Author': list(degree_centrality.keys()),
+         'Degree Centrality': list(degree_centrality.values()),
+         'In-Degree Centrality': list(in_degree_centrality.values()),
+         'Out-Degree Centrality': list(out_degree_centrality.values()),
+         'Betweenness Centrality': list(betweenness_centrality.values()),
+         'Closeness Centrality': list(closeness_centrality.values())
+     }).sort_values(by='Degree Centrality', ascending=False)
+
+     print(centrality_df.head(10))
+
+     centrality_df.head(10).to_excel("centrality.xlsx", index=False)
+
+     # Select the top N authors based on degree centrality for the subgraph
+     N = 50
+     top_authors = [author for author, _ in
+                    sorted(degree_centrality.items(), key=lambda item: item[1], reverse=True)[:N]]
+
+     # Extract the subgraph
+     subgraph = G.subgraph(top_authors)
+
+     # Draw the subgraph
+     fig_subgraph = plt.figure(figsize=(12, 12))
+     pos = nx.spring_layout(subgraph)
+     nx.draw_networkx(subgraph, pos, with_labels=True, node_size=500, node_color='skyblue', font_size=10, alpha=0.6,
+                      edge_color='gray')
+
+     plt.title(f"Subgraph of Top {N} Authors based on Degree Centrality")
+     plt.close(fig_subgraph)
+
+     # Sample a subset of nodes for the community analysis
+     sample_size = 500
+     sampled_nodes = list(G.nodes())[:sample_size]
+
+     # Extract the subgraph for the sampled nodes
+     sampled_subgraph = G.subgraph(sampled_nodes)
+
+     # Use the Girvan-Newman algorithm on the sampled subgraph
+     sampled_communities_gn = nx.community.girvan_newman(sampled_subgraph)
+
+     # Get the first partitioning of communities for the sampled subgraph
+     sampled_first_partition = next(sampled_communities_gn)
+
+     # Convert the first partition into a more readable format
+     sampled_community_list_gn = [list(community) for community in sampled_first_partition]
+
+     # Record the number of detected communities and the size of each community
+     sampled_community_sizes_gn = {f"Sampled Community GN {i + 1}": len(community) for i, community in
+                                   enumerate(sampled_community_list_gn)}
+     no_of_communities = len(sampled_community_sizes_gn)
+
+     # Generate a new position layout for the nodes in the sampled subgraph
+     sampled_pos = nx.spring_layout(sampled_subgraph)
+
+     # Helper function to get the edges internal to a community
+     def get_edges(G, community):
+         return [(u, v) for u, v in G.edges() if u in community and v in community]
+
+     # Visualize the communities in the sampled subgraph
+     fig_communities = plt.figure(figsize=(15, 15))
+
+     # Get unique colors for each community
+     colors = plt.cm.rainbow(np.linspace(0, 1, len(sampled_community_list_gn)))
+
+     # Draw nodes and edges with community colors
+     for community, color in zip(sampled_community_list_gn, colors):
+         nx.draw_networkx_nodes(sampled_subgraph, sampled_pos, nodelist=community, node_color=[color] * len(community),
+                                node_size=500)
+         nx.draw_networkx_edges(sampled_subgraph, sampled_pos, edgelist=get_edges(sampled_subgraph, community),
+                                alpha=0.5)
+
+     # Draw labels for nodes
+     nx.draw_networkx_labels(sampled_subgraph, sampled_pos, font_size=10, font_weight="bold")
+
+     plt.title("Communities in Sampled Subgraph")
+     plt.axis("off")
+     plt.close(fig_communities)
+
+     return centrality_df, fig_subgraph, fig_communities, no_of_communities
+
+ # analyze_comments(data)
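+
+ # Hypothetical usage sketch (assumes all_comments.xlsx was produced by
+ # channelVideoDataExtraction.getVideoComments and has 'author', 'comment_id'
+ # and 'linkage' columns):
+ #   df = pd.read_excel("all_comments.xlsx")
+ #   centrality_df, fig_top, fig_communities, n = analyze_comments(df)
+ #   print(f"Detected {n} communities among the sampled commenters")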
channelDataExtraction.py ADDED
@@ -0,0 +1,36 @@
+ import googleapiclient.discovery
+
+
+ def getChannelData(api_key, channel_id):
+     try:
+         # Create a YouTube API object
+         youtube = googleapiclient.discovery.build("youtube",
+                                                   "v3",
+                                                   developerKey=api_key)
+         # Request channel details
+         request = youtube.channels().list(part="snippet,contentDetails,statistics",
+                                           id=channel_id)
+         response = request.execute()
+
+         # Get the channel details from the response
+         channel = response["items"][0]
+
+         # Channel details dictionary
+         channel_details = {
+             "title": channel["snippet"]["title"],
+             "description": channel["snippet"]["description"],
+             "viewCount": channel["statistics"]["viewCount"],
+             "subscriberCount": channel["statistics"]["subscriberCount"],
+             "uploads": channel['contentDetails']['relatedPlaylists']['uploads'],
+             "thumbnail": channel['snippet']['thumbnails']['medium']['url']
+         }
+
+         print(channel_details)
+
+         return channel_details
+
+     except Exception:
+         # Bad channel IDs, quota errors, and network failures all surface as None
+         return None
+
+
+ # getChannelData(api_key, channel_id)
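+
+ # Return shape (keys fixed by the dictionary above; values illustrative):
+ # {"title": ..., "description": ..., "viewCount": "...", "subscriberCount": "...",
+ #  "uploads": "UU...", "thumbnail": "https://..."}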
channelVideoDataExtraction.py ADDED
@@ -0,0 +1,255 @@
+ import re
+ import pandas as pd
+ import googleapiclient.discovery
+
+
+ def getVideoComments(api_key, video_id):
+     # Create a YouTube Data API object
+     youtube = googleapiclient.discovery.build("youtube", "v3", developerKey=api_key)
+
+     def collect_comments(response, all_comments):
+         """Flatten one commentThreads page into comment/reply records."""
+         for comment in response['items']:
+             comment_data = {
+                 'comment_id': comment['id'],
+                 'author': comment["snippet"]["topLevelComment"]['snippet'].get('authorDisplayName', None),
+                 'like_count': comment["snippet"]["topLevelComment"]['snippet'].get('likeCount', None),
+                 'comment_text': comment["snippet"]["topLevelComment"]['snippet'].get('textOriginal', None),
+                 'comment_date': comment["snippet"]["topLevelComment"]['snippet'].get('publishedAt', None),
+             }
+             all_comments.append(comment_data)
+
+             # Check if there are replies
+             if 'replies' in comment:
+                 for reply in comment['replies']['comments']:
+                     reply_data = {
+                         'comment_id': reply['id'],
+                         'author': reply['snippet'].get('authorDisplayName', None),
+                         'comment_text': reply['snippet'].get('textOriginal', None),
+                         'comment_date': reply['snippet'].get('publishedAt', None),
+                         'like_count': reply['snippet'].get('likeCount', None),
+                         'linkage': comment_data['comment_id'],  # Link reply to the main comment
+                     }
+                     all_comments.append(reply_data)
+
+     # Make an API request to get the first page of comments for the video
+     request = youtube.commentThreads().list(part="snippet,replies",
+                                             videoId=video_id,
+                                             maxResults=100,
+                                             textFormat='plainText')
+     response = request.execute()
+
+     all_comments = []
+     collect_comments(response, all_comments)
+
+     next_page_available = response.get('nextPageToken')
+
+     # Follow pagination until exhausted, capped at roughly 1,000 comments
+     while next_page_available is not None and len(all_comments) < 1000:
+         request = youtube.commentThreads().list(part="snippet,replies",
+                                                 videoId=video_id,
+                                                 maxResults=100,
+                                                 textFormat='plainText',
+                                                 pageToken=next_page_available)
+         response = request.execute()
+         collect_comments(response, all_comments)
+         next_page_available = response.get('nextPageToken')
+
+     # Create the dataframe
+     comment_data = pd.DataFrame(all_comments)
+
+     # Remove illegal characters (non-printable / non-ASCII) from the entire dataframe
+     pattern = r'[^\x20-\x7E]|𝙄'
+     comment_data.replace(pattern, '', regex=True, inplace=True)
+
+     comment_data = comment_data.drop_duplicates()
+     comment_data["like_count"] = comment_data["like_count"]\
+         .apply(pd.to_numeric, errors='coerce')
+
+     # Remove duplicates based on the 'comment_text' column
+     comment_data = comment_data.drop_duplicates(subset='comment_text')
+
+     # Convert 'comment_date' to a pandas datetime object
+     comment_data['comment_date'] = pd.to_datetime(comment_data['comment_date'])
+
+     # Format 'comment_date' on a 12-hour clock with AM/PM
+     comment_data['comment_date'] = comment_data['comment_date']\
+         .dt.strftime('%Y-%m-%d %I:%M:%S %p')
+
+     # Sort the DataFrame by "like_count" in descending order
+     comment_data = comment_data.sort_values(by="like_count", ascending=False)
+     # Reset the index
+     comment_data.reset_index(drop=True, inplace=True)
+
+     comment_data.to_excel("all_comments.xlsx", index=False)
+
+     print(comment_data.head(5))
+
+     return comment_data
+
+
+ def getVideoList(api_key, playlist_id):
+     # Create a YouTube API object
+     youtube = googleapiclient.discovery.build("youtube", "v3", developerKey=api_key)
+
+     def collect_videos(response, all_videos):
+         """Flatten one playlistItems page into id/title/thumbnail records."""
+         for vid in response['items']:
+             vid_stats = {
+                 'id': vid['contentDetails'].get('videoId', None),
+                 'title': vid['snippet'].get('title', None),
+                 # Private/deleted videos can lack thumbnails, so chain .get()
+                 'thumbnail': vid['snippet'].get('thumbnails', {}).get('default', {}).get('url', None)
+             }
+             all_videos.append(vid_stats)
+
+     request = youtube.playlistItems().list(part="contentDetails,snippet",
+                                            playlistId=playlist_id,
+                                            maxResults=50)
+     response = request.execute()
+
+     all_videos = []
+     collect_videos(response, all_videos)
+
+     next_page_available = response.get('nextPageToken')
+
+     # Follow pagination until there are no further pages
+     while next_page_available is not None:
+         request = youtube.playlistItems().list(part="contentDetails,snippet",
+                                                playlistId=playlist_id,
+                                                maxResults=50,
+                                                pageToken=next_page_available)
+         response = request.execute()
+         collect_videos(response, all_videos)
+         next_page_available = response.get('nextPageToken')
+
+     # print(all_videos)
+     return all_videos
+
+
+ def buildVideoListDataframe(api_key, video_ids):
+     youtube = googleapiclient.discovery.build("youtube", "v3", developerKey=api_key)
+
+     all_vids_stats = []
+
+     # The videos endpoint accepts at most 50 IDs per call
+     for i in range(0, len(video_ids), 50):
+         request = youtube.videos().list(
+             part='snippet,contentDetails,statistics',
+             id=','.join(video_ids[i:i + 50]))
+         response = request.execute()
+
+         for vid in response['items']:
+             thumbnail_url = vid['snippet']['thumbnails'].get('standard', {}).get('url', None)
+
+             vid_stats = {
+                 'id': vid.get('id', None),
+                 'title': vid['snippet'].get('title', None),
+                 'published_date': vid['snippet'].get('publishedAt', None),
+                 'tags': vid['snippet'].get('tags', []),
+                 'duration': vid['contentDetails'].get('duration', None),
+                 'view_count': vid['statistics'].get('viewCount', None),
+                 'like_count': vid['statistics'].get('likeCount', None),
+                 'favorite_count': vid['statistics'].get('favoriteCount', None),
+                 'comment_count': vid['statistics'].get('commentCount', None),
+                 'thumbnail': thumbnail_url
+             }
+             all_vids_stats.append(vid_stats)
+
+     # Create the dataframe
+     vids_info = pd.DataFrame(all_vids_stats)
+     # Convert columns to numeric
+     numeric_columns = ['comment_count', 'like_count', 'view_count']
+     vids_info[numeric_columns] = vids_info[numeric_columns]\
+         .apply(pd.to_numeric, errors='coerce')
+
+     # Function to convert an ISO 8601 duration (e.g. "PT1H2M30S") to minutes
+     def iso8601_duration_to_minutes(duration):
+         if not duration:
+             return 0.0
+
+         hours_match = re.search(r'(\d+)H', duration)
+         minutes_match = re.search(r'(\d+)M', duration)
+         seconds_match = re.search(r'(\d+)S', duration)
+
+         # Get the hours, minutes and seconds values, or default to 0 if absent
+         hours = int(hours_match.group(1)) if hours_match else 0
+         minutes = int(minutes_match.group(1)) if minutes_match else 0
+         seconds = int(seconds_match.group(1)) if seconds_match else 0
+
+         # Calculate the total duration in minutes
+         return hours * 60 + minutes + seconds / 60.0
+
+     # Apply the conversion function to the 'duration' column
+     vids_info['duration_minutes'] = vids_info['duration']\
+         .apply(iso8601_duration_to_minutes)
+
+     # Convert 'published_date' to a pandas datetime object
+     vids_info['published_date'] = pd.to_datetime(vids_info['published_date'])
+
+     # Format 'published_date' on a 24-hour clock so it round-trips cleanly
+     # through pd.to_datetime in Home.py
+     vids_info['published_date'] = vids_info['published_date']\
+         .dt.strftime('%Y-%m-%d %H:%M:%S')
+
+     vids_info.to_excel("all_vids_info.xlsx", index=False)
+
+     print(vids_info.head(5))
+
+     return vids_info
+
+
+ # video_ids = getVideoList(API_KEY, playlist_id)
+ # video_ids = [video['id'] for video in video_ids if video['id'] is not None]
+ # buildVideoListDataframe(API_KEY, video_ids)
+
+ # getVideoComments(api_key, "video_id")
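+
+ # End-to-end sketch with hypothetical API_KEY/CHANNEL_ID values, mirroring
+ # Home.py's download_data (getChannelData comes from channelDataExtraction):
+ #   channel = getChannelData(API_KEY, CHANNEL_ID)
+ #   videos = getVideoList(API_KEY, channel["uploads"])
+ #   ids = [v["id"] for v in videos if v["id"] is not None]
+ #   stats_df = buildVideoListDataframe(API_KEY, ids)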
requirements.txt ADDED
@@ -0,0 +1,14 @@
+ numpy==1.26.0
+ streamlit==1.27.0
+ plotly==5.17.0
+ textblob==0.17.1
+ pandas==2.1.1
+ matplotlib==3.8.0
+ wordcloud==1.9.2
+ prophet==1.1.4
+ networkx==3.1
+ igraph==0.10.8
+ streamlit_extras==0.3.2
+ openpyxl==3.1.2
+ google-api-python-client~=2.102.0
+ scipy~=1.11.2
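
To run the dashboard locally (assuming these five files sit at the repository root, with the video_data page that switch_page targets provided under pages/): install the pinned dependencies with pip install -r requirements.txt, then launch with streamlit run Home.py.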