Upload 5 files
- Home.py +395 -0
- analyze_comments.py +115 -0
- channelDataExtraction.py +36 -0
- channelVideoDataExtraction.py +255 -0
- requirements.txt +14 -0
Home.py
ADDED
@@ -0,0 +1,395 @@
import datetime

import streamlit as st
import io
import pandas as pd
import plotly.express as px
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import plotly.graph_objects as go

from streamlit_extras.metric_cards import style_metric_cards
from streamlit_extras.chart_container import chart_container
from streamlit_extras.switch_page_button import switch_page
from streamlit_extras.app_logo import add_logo

from prophet import Prophet

from channelDataExtraction import getChannelData
from channelVideoDataExtraction import getVideoList, buildVideoListDataframe


########################################################################################################################
# FUNCTIONS
########################################################################################################################
@st.cache_data
def download_data(api_key, channel_id):
    channel_details = getChannelData(api_key, channel_id)

    # check if bad channel id
    if channel_details is None:
        return None, None, None, None

    videos = getVideoList(api_key, channel_details["uploads"])
    videos_df = pd.DataFrame(videos)
    video_ids = [video['id'] for video in videos if video['id'] is not None]
    all_video_data = buildVideoListDataframe(api_key, video_ids)

    st.session_state.start_index = 0
    st.session_state.end_index = 10
    st.session_state['video_id'] = None
    st.session_state.all_video_df = all_video_data

    st.session_state.api_key = st.session_state.API_KEY

    return channel_details, videos, all_video_data, videos_df


def display_video_list(video_data, start_index, end_index, search_query=None):
    """Displays a list of videos in a tabular format with custom column order and buttons."""

    # Input widget for searching videos by title
    if search_query is None:
        search_query = ""
    new_search_query = st.text_input("Search Videos by Title", search_query)

    # Initialize start_index and end_index in session_state
    if 'start_index' not in st.session_state:
        st.session_state.start_index = start_index
    if 'end_index' not in st.session_state:
        st.session_state.end_index = end_index

    # If a new search query is entered, reset the start and end indices
    if new_search_query != search_query:
        st.session_state.start_index = start_index
        st.session_state.end_index = end_index

    # Filter videos based on the search query across the entire video_data list
    filtered_videos = [video for video in video_data if new_search_query.lower() in video['title'].lower()]

    # Paginate the filtered results
    paginated_videos = filtered_videos[st.session_state.start_index:st.session_state.end_index]

    for video in paginated_videos:
        col1, col2, col3, col4 = st.columns(4)
        with col1:
            st.image(video['thumbnail'])
        with col2:
            st.write(video['id'])
        with col3:
            st.write(video['title'])
        with col4:
            video_stats = st.button("Check Video Statistics", key=video['id'])
            if video_stats:
                st.session_state['video_id'] = video['id']
                switch_page("video_data")

    # Display a button to load the next 10 search results
    if st.session_state.end_index < len(filtered_videos):
        if st.button('Load next 10 videos', key='load_next'):
            st.session_state.start_index = st.session_state.end_index
            st.session_state.end_index += 10
            # Rerun immediately so the next page is rendered on this click
            # (st.rerun is available from Streamlit 1.27, the version pinned in requirements.txt)
            st.rerun()

########################################################################################################################
# MAIN PAGE CONFIGURATION
########################################################################################################################
st.set_page_config(page_title="Youtube Channel Analytics Dashboard",
                   page_icon="📊",
                   layout="wide")

########################################################################################################################
# SIDE BAR CONFIGURATION
########################################################################################################################
st.title("YouTube Analytics Dashboard")

# Sidebar
st.sidebar.title("Settings")

# Sidebar: Enter Channel ID and YouTube API Key
if 'API_KEY' not in st.session_state:
    st.session_state.API_KEY = ""
if 'CHANNEL_ID' not in st.session_state:
    st.session_state.CHANNEL_ID = ""

st.session_state.API_KEY = st.sidebar.text_input("Enter your YouTube API Key", st.session_state.API_KEY,
                                                 type="password")
st.session_state.CHANNEL_ID = st.sidebar.text_input("Enter the YouTube Channel ID", st.session_state.CHANNEL_ID)

if not st.session_state.API_KEY or not st.session_state.CHANNEL_ID:
    st.warning("Please enter your API Key and Channel ID.")
    # Display the GitHub link for the user manual
    user_manual_link = "https://github.com/zainmz/Youtube-Channel-Analytics-Dashboard"
    st.markdown(f"If you need help, please refer to the GitHub Repository for the [User Manual]({user_manual_link}).")
    st.stop()

# Data Refresh Button
refresh_button = st.sidebar.button("Refresh Data")

# First Data Load
channel_details, videos, all_video_data, videos_df = download_data(st.session_state.API_KEY, st.session_state.CHANNEL_ID)

if channel_details is None:
    st.warning("Invalid YouTube Channel ID. Please check and enter a valid Channel ID.")
    st.stop()

if refresh_button:
    # Drop the memoized result first; otherwise @st.cache_data would serve the same cached data again
    download_data.clear()
    with st.spinner("Refreshing data..."):
        channel_details, videos, all_video_data, videos_df = download_data(st.session_state.API_KEY, st.session_state.CHANNEL_ID)

if channel_details is None:
    st.warning("Invalid YouTube Channel ID. Please check and enter a valid Channel ID.")
    st.stop()

# Data Filters for fine-tuned data selection
st.sidebar.title("Data Filters")

num_videos = st.sidebar.slider("Select Number of Top Videos to Display:", 1, 50, 10)

# Convert the 'published_date' column to datetime format
all_video_data['published_date'] = pd.to_datetime(all_video_data['published_date'])

# Extract min and max publish dates
min_date = all_video_data['published_date'].min().date()  # Ensure it's a date object
max_date = all_video_data['published_date'].max().date()  # Ensure it's a date object

# Sidebar date input
start_date = st.sidebar.date_input("Select Start Date", min_date)
end_date = st.sidebar.date_input("Select End Date", max_date)

if start_date > end_date:
    st.sidebar.warning("Start date should be earlier than end date.")
    st.stop()

tag_search = st.sidebar.text_input("Search Videos by Tag")

date_range_start = pd.Timestamp(start_date)
date_range_end = pd.Timestamp(end_date)

# .copy() so later column assignments act on an independent frame, not a view
filtered_data = all_video_data[(all_video_data['published_date'] >= date_range_start) &
                               (all_video_data['published_date'] <= date_range_end)].copy()

if tag_search:
    # Exact tag membership test: 'tags' holds a list of tag strings per video
    filtered_data = filtered_data[filtered_data['tags'].apply(lambda x: tag_search in x)]

########################################################################################################################
# CHANNEL DETAILS AREA CONFIGURATION
########################################################################################################################

# Display channel details
st.header("Channel Details", divider="green")

col1, col2, col3 = st.columns(3)

with col1:
    channel_thumbnail = channel_details['thumbnail']

    add_logo(channel_thumbnail, height=300)

    view_count = int(channel_details['viewCount'])
    subscriber_count = int(channel_details['subscriberCount'])

    # Format view count and subscriber count with commas
    view_count_formatted = "{:,}".format(view_count)
    subscriber_count_formatted = "{:,}".format(subscriber_count)

    st.markdown(f"**Channel Title:** {channel_details['title']}")
    st.markdown(f"**Channel Description:** {channel_details['description']}")

with col3:
    # Go to Channel Button
    st.link_button("Go to Channel", f"https://www.youtube.com/channel/{st.session_state.CHANNEL_ID}")

col1, col2, col3 = st.columns(3)
col1.metric("Total Views", view_count_formatted, "")
col2.metric("Subscribers", subscriber_count_formatted, "")
col3.metric("Total Videos", len(videos), "")
style_metric_cards(background_color="#000000",
                   border_left_color="#049204",
                   border_color="#0E0E0E"
                   )

########################################################################################################################
# TOP VIDEO GRAPHS AREA
########################################################################################################################

col1, col2, col3 = st.columns(3)
# Display statistical graphs for the top videos based on views
with col1:
    st.subheader(f"Top {num_videos} Videos Based on Views")
    sorted_video_data = filtered_data.sort_values(by='view_count', ascending=False)
    # Get the top videos from the sorted DataFrame
    top_views_df = sorted_video_data.head(num_videos)
    with chart_container(top_views_df):
        # Create a bar chart using Plotly
        fig = px.bar(top_views_df, x='title', y='view_count')
        # Update the layout to rename the axes
        fig.update_layout(xaxis_title="Video Title",
                          yaxis_title="View Count")
        fig.update_traces(marker_color='green')
        # Display the bar chart in Streamlit
        st.plotly_chart(fig, use_container_width=True)

with col2:
    st.subheader(f"Top {num_videos} Videos Based on Likes")
    sorted_video_data = filtered_data.sort_values(by='like_count', ascending=False)
    # Get the top liked videos from the sorted DataFrame
    top_likes_df = sorted_video_data.head(num_videos)

    with chart_container(top_likes_df):
        # Create a bar chart using Plotly
        fig = px.bar(top_likes_df, x='title', y='like_count')
        # Update the layout to rename the axes
        fig.update_layout(xaxis_title="Video Title",
                          yaxis_title="Like Count")
        fig.update_traces(marker_color='orange')
        # Display the bar chart in Streamlit
        st.plotly_chart(fig, use_container_width=True)

with col3:
    st.subheader(f"Top {num_videos} Based on Comments")
    sorted_video_data = filtered_data.sort_values(by='comment_count', ascending=False)
    # Get the most-commented videos from the sorted DataFrame
    top_comments_df = sorted_video_data.head(num_videos)
    with chart_container(top_comments_df):
        # Create a bar chart using Plotly
        fig = px.bar(top_comments_df, x='title', y='comment_count')
        # Update the layout to rename the axes
        fig.update_layout(xaxis_title="Video Title",
                          yaxis_title="Comment Count")
        fig.update_traces(marker_color='green')
        # Display the bar chart in Streamlit
        st.plotly_chart(fig, use_container_width=True)

########################################################################################################################
# CHANNEL GROWTH STATS
########################################################################################################################

st.subheader("Viewership Growth Over Time", divider="green")
views = filtered_data['view_count']
dates = filtered_data['published_date']

# Creating a time series plot using Plotly
fig = go.Figure()

fig.add_trace(
    go.Scatter(x=dates, y=views, mode='lines+markers', name='Views Over Time', line=dict(color='orange'))
)

fig.update_layout(title='Views Over Time',
                  xaxis_title='Published Date',
                  yaxis_title='Number of Views',
                  template="plotly_dark")

st.plotly_chart(fig, use_container_width=True)

st.subheader("Predicted Viewership Growth Over Time", divider="green")

with st.spinner("Predicting Views for the next 30 days"):
    # Prepare dataframe for Prophet (.copy() avoids mutating all_video_data in place)
    forecast_df = all_video_data[['published_date', 'view_count']].copy()
    forecast_df.columns = ['ds', 'y']

    # Initialize the Prophet model
    model = Prophet(
        yearly_seasonality=False,
        weekly_seasonality=True,
        daily_seasonality=True,
        seasonality_mode='additive')

    # Fit the model with the data
    model.fit(forecast_df)

    # Dataframe for future dates
    future_dates = model.make_future_dataframe(periods=30)

    # Predict views for the future dates
    forecast = model.predict(future_dates)

    # Plotting using Plotly
    # Filter the forecast dataframe to include only the forecasted period
    forecasted_period = forecast[forecast['ds'] > forecast_df['ds'].max()]

    # Filter the original dataframe to include only the last 30 days
    last_date = forecast_df['ds'].max()
    window_start = last_date - datetime.timedelta(days=30)
    last_30_days = forecast_df[(forecast_df['ds'] > window_start) & (forecast_df['ds'] <= last_date)]

    trace1 = go.Scatter(x=last_30_days['ds'], y=last_30_days['y'], mode='lines', name='Actual Views (Last 30 Days)')
    trace2 = go.Scatter(x=forecasted_period['ds'], y=forecasted_period['yhat'], mode='lines',
                        name='Predicted Views (Next 30 Days)')
    layout = go.Layout(title="YouTube Views: Last 30 Days and Forecast for Next 30 Days", xaxis_title="Date",
                       yaxis_title="Views")
    fig = go.Figure(data=[trace1, trace2], layout=layout)

    # Display the combined historical and forecast data in Streamlit using Plotly
    st.plotly_chart(fig, use_container_width=True)

########################################################################################################################
# WORD CLOUD & LIKE TO VIEW RATIO
########################################################################################################################

col1, col2 = st.columns(2)

with col1:
    st.divider()
    with st.spinner("Generating Word Cloud..."):
        st.subheader("Most Common Tags")
        # Extracting tags from DataFrame and creating a single string
        all_tags = " ".join(" ".join(tags) for tags in filtered_data['tags'])

        # Generating the word cloud
        wordcloud = WordCloud(width=800, height=400, background_color='black').generate(all_tags)

        # Plotting the word cloud using matplotlib
        plt.figure(figsize=(10, 5))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.tight_layout(pad=0)

        # Saving the figure to a bytes buffer
        buf = io.BytesIO()
        plt.savefig(buf, format="png", bbox_inches='tight', pad_inches=0)
        buf.seek(0)

        st.image(buf, use_column_width=True)

with col2:
    # Calculating the Like-to-View Ratio
    filtered_data['like_to_view_ratio'] = filtered_data['like_count'] / filtered_data['view_count']

    # Extracting the like-to-view ratio from the dataframe
    like_to_view_ratio = filtered_data['like_to_view_ratio']

    st.divider()
    st.subheader("Like-to-View Ratio Over Time")

    # Creating a time series plot for Like-to-View Ratio using Plotly
    fig_ratio = go.Figure()

    fig_ratio.add_trace(go.Scatter(x=dates, y=like_to_view_ratio, mode='lines+markers', name='Like-to-View Ratio',
                                   line=dict(color='green')))

    fig_ratio.update_layout(xaxis_title='Published Date',
                            yaxis_title='Like-to-View Ratio',
                            template="plotly_dark")

    # Display the plot in Streamlit
    st.plotly_chart(fig_ratio, use_container_width=True)

########################################################################################################################
# DETAILED VIDEO STATS SELECTION SECTION
########################################################################################################################

st.divider()
st.subheader("Detailed Video Statistics Video Selection")
st.write("Click on Check Video Statistics to get detailed information related to the selected video")
# latest 10 videos
display_video_list(videos, 0, 10)
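Note: the forecasting block above relies on Prophet's fixed input contract, a two-column frame named ds (datetime) and y (numeric). A minimal self-contained sketch of the same fit/extend/predict cycle, using synthetic data invented purely for illustration:

import pandas as pd
from prophet import Prophet

# Synthetic daily series standing in for per-video view counts.
history = pd.DataFrame({
    "ds": pd.date_range("2024-01-01", periods=90, freq="D"),
    "y": [100 + i * 3 for i in range(90)],
})

model = Prophet(yearly_seasonality=False, weekly_seasonality=True,
                daily_seasonality=True, seasonality_mode="additive")
model.fit(history)

# Extend the timeline 30 days past the last observation, then predict.
future = model.make_future_dataframe(periods=30)
forecast = model.predict(future)
print(forecast[["ds", "yhat", "yhat_lower", "yhat_upper"]].tail(5))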
analyze_comments.py
ADDED
@@ -0,0 +1,115 @@
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np


def analyze_comments(data):
    # Reset the graph
    G = nx.DiGraph()

    # Add nodes to the graph representing authors
    for author in data['author'].unique():
        G.add_node(author)

    # Add edges to the graph representing replies
    for _, row in data.dropna(subset=['linkage']).iterrows():
        # Find the author of the main comment (the comment being replied to)
        main_comment_authors = data[data['comment_id'] == row['linkage']]['author'].values
        # Explicit length check; truthiness of a numpy array is ambiguous
        if len(main_comment_authors) > 0:
            main_comment_author = main_comment_authors[0]
            G.add_edge(row['author'], main_comment_author)

    # Calculate centrality measures
    degree_centrality = nx.degree_centrality(G)
    in_degree_centrality = nx.in_degree_centrality(G)
    out_degree_centrality = nx.out_degree_centrality(G)
    betweenness_centrality = nx.betweenness_centrality(G)
    closeness_centrality = nx.closeness_centrality(G)

    # Create a DataFrame to display the results
    centrality_df = pd.DataFrame({
        'Author': list(degree_centrality.keys()),
        'Degree Centrality': list(degree_centrality.values()),
        'In-Degree Centrality': list(in_degree_centrality.values()),
        'Out-Degree Centrality': list(out_degree_centrality.values()),
        'Betweenness Centrality': list(betweenness_centrality.values()),
        'Closeness Centrality': list(closeness_centrality.values())
    }).sort_values(by='Degree Centrality', ascending=False)

    print(centrality_df.head(10))

    centrality_df.head(10).to_excel("centrality.xlsx", index=False)

    # Select the top N authors based on degree centrality for the subgraph
    N = 50
    top_authors = [author for author, _ in
                   sorted(degree_centrality.items(), key=lambda item: item[1], reverse=True)[:N]]

    # Extract the subgraph
    subgraph = G.subgraph(top_authors)

    # Draw the subgraph
    fig_subgraph = plt.figure(figsize=(12, 12))
    pos = nx.spring_layout(subgraph)
    nx.draw_networkx(subgraph, pos, with_labels=True, node_size=500, node_color='skyblue', font_size=10, alpha=0.6,
                     edge_color='gray')

    plt.title("Subgraph of Top 50 Authors based on Degree Centrality")
    plt.close(fig_subgraph)

    # Sample a subset of nodes for community detection
    # (Girvan-Newman recomputes edge betweenness repeatedly, so it is kept to a small sample for speed)
    sample_size = 500
    sampled_nodes = list(G.nodes())[:sample_size]

    # Extract the subgraph for the sampled nodes
    sampled_subgraph = G.subgraph(sampled_nodes)

    # Use the Girvan-Newman algorithm on the sampled subgraph
    sampled_communities_gn = nx.community.girvan_newman(sampled_subgraph)

    # Get the first partitioning of communities for the sampled subgraph
    sampled_first_partition = next(sampled_communities_gn)

    # Convert the first partition into a more readable format
    sampled_community_list_gn = [list(community) for community in sampled_first_partition]

    # Record the number of detected communities and the size of each community for the sampled subgraph
    sampled_community_sizes_gn = {f"Sampled Community GN {i + 1}": len(community) for i, community in
                                  enumerate(sampled_community_list_gn)}
    no_of_communities = len(sampled_community_sizes_gn)

    # Generate a new position layout for the nodes in the sampled subgraph
    sampled_pos = nx.spring_layout(sampled_subgraph)

    # Helper function to get edges for a community
    def get_edges(G, community):
        return [(u, v) for u, v in G.edges() if u in community and v in community]

    # Visualize the communities in the sampled subgraph
    fig_communities = plt.figure(figsize=(15, 15))

    # Get unique colors for each community
    colors = plt.cm.rainbow(np.linspace(0, 1, len(sampled_community_list_gn)))

    # Draw nodes and edges with community colors
    for community, color in zip(sampled_community_list_gn, colors):
        nx.draw_networkx_nodes(sampled_subgraph, sampled_pos, nodelist=community, node_color=[color] * len(community),
                               node_size=500)
        nx.draw_networkx_edges(sampled_subgraph, sampled_pos, edgelist=get_edges(sampled_subgraph, community),
                               alpha=0.5)

    # Draw labels for nodes
    nx.draw_networkx_labels(sampled_subgraph, sampled_pos, font_size=10, font_weight="bold")

    plt.title("Communities in Sampled Subgraph")
    plt.axis("off")
    plt.close(fig_communities)

    return centrality_df, fig_subgraph, fig_communities, no_of_communities


if __name__ == "__main__":
    # Ad-hoc run against the comment export written by getVideoComments;
    # guarded so importing this module does not trigger the file read
    data = pd.read_excel("all_comments.xlsx")
    analyze_comments(data)
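For a quick check of analyze_comments without a real export, a hand-built thread with the same columns getVideoComments produces is enough. The rows below are invented; note the function also writes centrality.xlsx as a side effect:

import pandas as pd
from analyze_comments import analyze_comments

# Hypothetical three-author thread: bob and carol both reply to alice's comment.
sample = pd.DataFrame([
    {'comment_id': 'c1', 'author': 'alice', 'linkage': None},
    {'comment_id': 'c2', 'author': 'bob', 'linkage': 'c1'},
    {'comment_id': 'c3', 'author': 'carol', 'linkage': 'c1'},
])
centrality_df, fig_top, fig_communities, n_communities = analyze_comments(sample)
print(n_communities)  # communities found in the first Girvan-Newman split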
channelDataExtraction.py
ADDED
@@ -0,0 +1,36 @@
import googleapiclient.discovery


def getChannelData(api_key, channel_id):
    try:
        # Create a YouTube API object
        youtube = googleapiclient.discovery.build("youtube",
                                                  "v3",
                                                  developerKey=api_key)
        # request channel details
        request = youtube.channels().list(part="snippet,contentDetails,statistics",
                                          id=channel_id)
        response = request.execute()

        # Get the channel details from the response
        # (an unknown channel ID returns no "items", which raises here and falls through to the except block)
        channel = response["items"][0]

        # channel details dictionary
        channel_details = {
            "title": channel["snippet"]["title"],
            "description": channel["snippet"]["description"],
            "viewCount": channel["statistics"]["viewCount"],
            "subscriberCount": channel["statistics"]["subscriberCount"],
            "uploads": channel['contentDetails']['relatedPlaylists']['uploads'],
            "thumbnail": channel['snippet']['thumbnails']['medium']['url']
        }

        print(channel_details)

        return channel_details

    except Exception:
        # Signal a bad channel ID or failed API call to the caller
        return None


# getChannelData(api_key, channel_id)
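Illustrative call, for reference: the API key is a placeholder, and the channel ID shown is the public Google Developers channel, used here only as a sample input.

from channelDataExtraction import getChannelData

details = getChannelData("YOUR_API_KEY", "UC_x5XG1OV2P6uZZ5FSM9Ttw")
if details is None:
    print("Bad channel ID or API error")
else:
    print(details["title"], details["subscriberCount"])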
channelVideoDataExtraction.py
ADDED
@@ -0,0 +1,255 @@
import re
import pandas as pd
import googleapiclient.discovery


def getVideoComments(api_key, video_id):
    # Create a YouTube Data API object
    youtube = googleapiclient.discovery.build("youtube", "v3", developerKey=api_key)

    all_comments = []

    def collect_comments(response):
        # Parse one page of commentThreads results into flat comment records
        for comment in response['items']:
            top_snippet = comment["snippet"]["topLevelComment"]['snippet']
            comment_data = {
                'comment_id': comment['id'],
                'author': top_snippet.get('authorDisplayName', None),
                'like_count': top_snippet.get('likeCount', None),
                'comment_text': top_snippet.get('textOriginal', None),
                'comment_date': top_snippet.get('publishedAt', None),
            }
            all_comments.append(comment_data)

            # Check if there are replies
            if 'replies' in comment:
                for reply in comment['replies']['comments']:
                    reply_data = {
                        'comment_id': reply['id'],
                        'author': reply['snippet'].get('authorDisplayName', None),
                        'comment_text': reply['snippet'].get('textOriginal', None),
                        'comment_date': reply['snippet'].get('publishedAt', None),
                        'like_count': reply['snippet'].get('likeCount', None),
                        'linkage': comment_data['comment_id'],  # Link reply to the main comment
                    }
                    all_comments.append(reply_data)

    # Make an API request to get the first page of comments for the video
    request = youtube.commentThreads().list(part="snippet,replies",
                                            videoId=video_id,
                                            maxResults=100,
                                            textFormat='plainText')
    response = request.execute()
    collect_comments(response)
    next_page_available = response.get('nextPageToken')

    # Keep paging until there are no more pages or roughly 1,000 comments are collected
    while next_page_available is not None and len(all_comments) < 1000:
        request = youtube.commentThreads() \
            .list(part="snippet,replies",
                  videoId=video_id,
                  maxResults=100,
                  textFormat='plainText',
                  pageToken=next_page_available)
        response = request.execute()
        collect_comments(response)
        next_page_available = response.get('nextPageToken')

    # create the dataframe
    comment_data = pd.DataFrame(all_comments)

    # Remove characters outside the printable ASCII range (control characters,
    # emoji, styled Unicode letters such as '𝙄') so the Excel export stays clean
    pattern = r'[^\x20-\x7E]'
    comment_data.replace(pattern, '', regex=True, inplace=True)

    comment_data = comment_data.drop_duplicates()
    comment_data["like_count"] = comment_data["like_count"]\
        .apply(pd.to_numeric, errors='coerce')

    # Remove duplicates based on the 'comment_text' column
    comment_data = comment_data.drop_duplicates(subset='comment_text')

    # Convert 'comment_date' to a pandas datetime object
    comment_data['comment_date'] = pd.to_datetime(comment_data['comment_date'])

    # Format 'comment_date' as an unambiguous 24-hour timestamp
    comment_data['comment_date'] = comment_data['comment_date']\
        .dt.strftime('%Y-%m-%d %H:%M:%S')

    # Sort the DataFrame by "like_count" in descending order
    comment_data = comment_data.sort_values(by="like_count", ascending=False)
    # Reset the index
    comment_data.reset_index(drop=True, inplace=True)

    comment_data.to_excel("all_comments.xlsx", index=False)

    print(comment_data.head(5))

    return comment_data


def getVideoList(api_key, playlist_id):
    # Create a YouTube API object
    youtube = googleapiclient.discovery.build("youtube", "v3", developerKey=api_key)

    all_videos = []

    def collect_videos(response):
        # Keep the id, title and default thumbnail of each playlist item
        for vid in response['items']:
            vid_stats = {
                'id': vid['contentDetails'].get('videoId', None),
                'title': vid['snippet'].get('title', None),
                'thumbnail': vid['snippet']['thumbnails']['default']['url']
            }
            all_videos.append(vid_stats)

    request = youtube.playlistItems().list(part="contentDetails,snippet",
                                           playlistId=playlist_id,
                                           maxResults=50)
    response = request.execute()
    collect_videos(response)
    next_page_available = response.get('nextPageToken')

    # Follow the pagination tokens until the playlist is exhausted
    while next_page_available is not None:
        request = youtube.playlistItems().list(part="contentDetails,snippet",
                                               playlistId=playlist_id,
                                               maxResults=50,
                                               pageToken=next_page_available)
        response = request.execute()
        collect_videos(response)
        next_page_available = response.get('nextPageToken')

    # print(all_videos)
    return all_videos


def buildVideoListDataframe(api_key, video_ids):
    youtube = googleapiclient.discovery.build("youtube", "v3", developerKey=api_key)

    all_vids_stats = []

    # videos().list accepts at most 50 IDs per call, so batch the requests
    for i in range(0, len(video_ids), 50):
        request = youtube.videos().list(
            part='snippet,contentDetails,statistics',
            id=','.join(video_ids[i:i + 50]))
        response = request.execute()

        for vid in response['items']:
            thumbnail_url = vid['snippet']['thumbnails'].get('standard', {}).get('url', None)

            vid_stats = {
                'id': vid.get('id', None),
                'title': vid['snippet'].get('title', None),
                'published_date': vid['snippet'].get('publishedAt', None),
                'tags': vid['snippet'].get('tags', []),
                'duration': vid['contentDetails'].get('duration', None),
                'view_count': vid['statistics'].get('viewCount', None),
                'like_count': vid['statistics'].get('likeCount', None),
                'favorite_count': vid['statistics'].get('favoriteCount', None),
                'comment_count': vid['statistics'].get('commentCount', None),
                'thumbnail': thumbnail_url
            }
            all_vids_stats.append(vid_stats)

    # create the dataframe
    vids_info = pd.DataFrame(all_vids_stats)
    # Convert columns to numeric
    numeric_columns = ['comment_count', 'like_count', 'view_count']
    vids_info[numeric_columns] = vids_info[numeric_columns]\
        .apply(pd.to_numeric, errors='coerce')

    # Function to convert an ISO 8601 duration (e.g. "PT1H2M30S") to minutes
    def iso8601_duration_to_minutes(duration):
        if not duration:
            return 0.0

        hours_match = re.search(r'(\d+)H', duration)
        minutes_match = re.search(r'(\d+)M', duration)
        seconds_match = re.search(r'(\d+)S', duration)

        # Get the hours, minutes and seconds values, or default to 0 if they are not found.
        hours = int(hours_match.group(1)) if hours_match else 0
        minutes = int(minutes_match.group(1)) if minutes_match else 0
        seconds = int(seconds_match.group(1)) if seconds_match else 0

        # Calculate the total duration in minutes (counting hours, so videos over an hour are correct).
        total_minutes = hours * 60 + minutes + seconds / 60.0

        return total_minutes

    # Apply the conversion function to the 'duration' column
    vids_info['duration_minutes'] = vids_info['duration']\
        .apply(iso8601_duration_to_minutes)

    # Convert 'published_date' to a pandas datetime object
    vids_info['published_date'] = pd.to_datetime(vids_info['published_date'])

    # Format 'published_date' as a 24-hour timestamp so it survives the
    # round-trip through pd.to_datetime in Home.py without losing AM/PM information
    vids_info['published_date'] = vids_info['published_date']\
        .dt.strftime('%Y-%m-%d %H:%M:%S')

    vids_info.to_excel("all_vids_info.xlsx", index=False)

    print(vids_info.head(5))

    return vids_info


# video_ids = getVideoList(API_KEY, playlist_id)
# video_ids = [video['id'] for video in video_ids if video['id'] is not None]
# buildVideoListDataframe(API_KEY, video_ids)

# getVideoComments(api_key, "video_id")
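A standalone spot-check of the duration conversion used in buildVideoListDataframe (copied out of the function purely for illustration; it assumes the PT…H…M…S strings the Videos API returns, not date-bearing durations like P1DT2H):

import re

def iso8601_duration_to_minutes(duration):
    # Same logic as the nested helper in buildVideoListDataframe
    if not duration:
        return 0.0
    hours = re.search(r'(\d+)H', duration)
    minutes = re.search(r'(\d+)M', duration)
    seconds = re.search(r'(\d+)S', duration)
    return (int(hours.group(1)) if hours else 0) * 60 \
        + (int(minutes.group(1)) if minutes else 0) \
        + (int(seconds.group(1)) if seconds else 0) / 60.0

assert iso8601_duration_to_minutes("PT2M30S") == 2.5
assert iso8601_duration_to_minutes("PT45S") == 0.75
assert iso8601_duration_to_minutes("PT1H5M") == 65.0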
requirements.txt
ADDED
@@ -0,0 +1,14 @@
numpy==1.26.0
streamlit==1.27.0
plotly==5.17.0
textblob==0.17.1
pandas==2.1.1
matplotlib==3.8.0
wordcloud==1.9.2
prophet==1.1.4
networkx==3.1
igraph==0.10.8
streamlit_extras==0.3.2
openpyxl==3.1.2
google-api-python-client~=2.102.0
scipy~=1.11.2
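To run the dashboard locally (standard Streamlit workflow; Home.py is the entrypoint, and the switch_page("video_data") call in it assumes a pages/ directory with a video-details page that is not part of this upload):

pip install -r requirements.txt
streamlit run Home.py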