Spaces:

dylanplummer
/

NextJump-analytics

Running

File size: 17,567 Bytes

from google.analytics.data_v1beta import BetaAnalyticsDataClient
from google.analytics.data_v1beta.types import (
    DateRange,
    Dimension,
    Metric,
    RunReportRequest,
    RunRealtimeReportRequest
)

import gradio as gr
import os
import json
import time
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

# Name of the Google Analytics event that is counted as a completed exercise.
FINISHED_EXERCISE = 'finished_exercise'
# Google Analytics property the reports are run against.
PROPERTY_ID = "384068977"

# When the service-account key is supplied inline through the
# GOOGLE_APPLICATION_CREDENTIALS_JSON environment variable, write it to a file
# next to this script and point GOOGLE_APPLICATION_CREDENTIALS at that file.
try:
    credentials_json = os.environ['GOOGLE_APPLICATION_CREDENTIALS_JSON']
except KeyError:  # running locally
    pass
else:
    try:
        credentials_dict = json.loads(credentials_json)
        # Write the key to exactly the path that is exported below, so the
        # two can never disagree when the working directory is not the
        # directory containing this script.
        credentials_path = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), 'credentials.json')
        with open(credentials_path, 'w') as f:
            json.dump(credentials_dict, f)
        # set env var to filename
        os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials_path
    except (ValueError, OSError) as e:
        # Best effort: a malformed key or an unwritable directory is reported
        # but does not prevent the module from loading.
        print(f"Error loading credentials: {e}")

# Country codes: lookup from the 'Alpha-2 code' column to the 'Alpha-3 code'
# column of the tab-separated iso.tsv table (both stripped of whitespace).
iso = pd.read_csv('iso.tsv', sep='\t')
for code_column in ('Alpha-2 code', 'Alpha-3 code'):
    iso[code_column] = iso[code_column].str.strip()
iso = iso.set_index('Alpha-2 code')
alpha_2_map = iso['Alpha-3 code'].to_dict()

# County data loaded from counties.json (passed to plotly as `geojson`).
with open('counties.json') as f:
    counties = json.load(f)

# cities.csv is keyed by 'Criteria ID' (as a string); the state is taken from
# the second comma-separated field of 'Canonical Name'.
ga_cities = pd.read_csv('cities.csv')
ga_cities['Criteria ID'] = ga_cities['Criteria ID'].astype(str)
ga_cities = ga_cities.set_index('Criteria ID')
ga_city_map = ga_cities['Name'].to_dict()
ga_cities['state'] = ga_cities['Canonical Name'].str.split(',').str[1].str.strip()
ga_state_map = ga_cities['state'].to_dict()

# uscities.csv is keyed by "<city>, <state_name>" and provides the county
# FIPS code and county name for each such key.
cities = pd.read_csv('uscities.csv')
cities['full_city'] = cities['city'] + ', ' + cities['state_name']
cities = cities.set_index('full_city')
city_county_map = cities['county_fips'].to_dict()
city_county_name_map = cities['county_name'].to_dict()

# Report cache shared across calls to full_report().
cached_report = None   # last DataFrame fetched, or None before the first fetch
cache_time = 0         # time.time() of the last fetch
reload_cache = False   # True while the cached report is considered fresh
# 6 hours
reload_every = 6 * 60 * 60

def mpl_to_plotly(cmap, pl_entries=11, rdigits=2):
    """Convert a colormap callable into a Plotly colorscale.

    Args:
        cmap: Callable that maps an array of values in [0, 1] to an
            ``(N, >=3)`` array of colour components in [0, 1]; only the first
            three columns (red, green, blue) are used.
        pl_entries: Number of evenly spaced entries in the colorscale.
        rdigits: Number of digits the scale positions are rounded to.

    Returns:
        A list of ``[position, 'rgb(r, g, b)']`` pairs, where ``position`` is
        a Python float and ``r``, ``g``, ``b`` are integers in 0..255.
    """
    scale = np.linspace(0, 1, pl_entries)
    colors = (cmap(scale)[:, :3] * 255).astype(np.uint8)
    # .tolist() yields native Python numbers. Formatting NumPy scalars through
    # tuple() would embed their repr (e.g. "np.uint8(0)" on NumPy 2), which is
    # not a valid colour string.
    return [
        [round(position, rdigits), f'rgb({red}, {green}, {blue})']
        for position, (red, green, blue) in zip(scale.tolist(), colors.tolist())
    ]

# First day covered by every report request.
_REPORT_START_DATE = "2023-06-30"
# The Data API returns only 10,000 rows when no limit is given; ask for the
# documented per-request maximum instead so the totals are not truncated.
_GA_ROW_LIMIT = 250000
_REPORT_COLUMNS = ['day', 'jumps', 'continent', 'country', 'iso', 'cityId', 'exercise']
# Exercise values that carry no exercise name.
_UNKNOWN_EXERCISES = ('n/a', '(not set)')


def _rows_to_frame(rows, with_exercise):
    """Turn report rows for FINISHED_EXERCISE events into a DataFrame."""
    records = []
    for row in rows:
        dims = row.dimension_values
        if dims[1].value != FINISHED_EXERCISE:
            continue
        records.append({
            'day': int(dims[0].value),
            'jumps': float(row.metric_values[0].value),
            'continent': dims[2].value,
            'country': dims[3].value,
            'iso': dims[4].value,
            'cityId': dims[5].value,
            'exercise': dims[6].value if with_exercise else 'n/a',
        })
    return pd.DataFrame(records, columns=_REPORT_COLUMNS)


def _run_event_report(client, end_date, with_exercise):
    """Run one report from _REPORT_START_DATE to *end_date* and parse it."""
    dimension_names = ["nthDay", "eventName", "continent", "country", "countryId", "cityId"]
    if with_exercise:
        dimension_names.append("customEvent:exercise")
    request = RunReportRequest(
        property=f"properties/{PROPERTY_ID}",
        dimensions=[Dimension(name=name) for name in dimension_names],
        metrics=[Metric(name="eventValue")],
        date_ranges=[DateRange(start_date=_REPORT_START_DATE, end_date=end_date)],
        limit=_GA_ROW_LIMIT,
    )
    response = client.run_report(request)
    return _rows_to_frame(response.rows, with_exercise)


def _exercise_duration(exercise):
    """Return the integer after the first two characters of *exercise*, or 0."""
    if exercise in _UNKNOWN_EXERCISES:
        return 0
    try:
        return int(exercise[2:])
    except ValueError:
        # An unexpected exercise name must not take the whole dashboard down.
        return 0


def _fetch_report():
    """Fetch all FINISHED_EXERCISE rows, with and without exercise names."""
    client = BetaAnalyticsDataClient()

    # first request all data where we have the exercise name
    df = _run_event_report(client, "today", with_exercise=True)
    if df.empty:
        # No exercise-tagged rows at all: the second request covers everything.
        end_date = "today"
    else:
        # then find the earliest day we started getting exercise name data
        first_day = int(df['day'].min())
        end_date = pd.to_datetime(_REPORT_START_DATE) + pd.DateOffset(days=first_day)
        # only need YYYY-MM-DD
        end_date = end_date.strftime('%Y-%m-%d')

    # then request all data where we don't have the exercise name
    new_df = _run_event_report(client, end_date, with_exercise=False)
    # NOTE(review): both requests appear to include day `first_day`, so that
    # day may be counted twice. Dropping rows with day >= first_day from
    # new_df would avoid it - confirm against the Analytics data first.
    df = pd.concat([df, new_df]).reset_index(drop=True)
    df['duration'] = df['exercise'].apply(_exercise_duration)
    return df


def _jump_totals(frame, keys):
    """Sum the 'jumps' column of *frame* per unique combination of *keys*."""
    return frame.groupby(keys)[['jumps']].sum().reset_index()


def _city_totals(frame):
    """Sum jumps per (city, iso) and label each city as '<city>, <iso>'."""
    totals = _jump_totals(frame, ['city', 'iso'])
    totals = totals[totals['city'] != '(not set)'].copy()
    totals['city'] = totals['city'] + ', ' + totals['iso']
    return totals


def _ranked_bar(totals, label_col, top_n, title, height):
    """Horizontal bar chart of the *top_n* rows of *totals* by jumps."""
    ranked = totals.sort_values(by=['jumps'], ascending=False)
    top_labels = ranked.iloc[:top_n][label_col].tolist()
    to_plot = ranked[ranked[label_col].isin(top_labels)]
    to_plot = to_plot.sort_values(by=['jumps'], ascending=True)
    fig = px.bar(to_plot,
                 y=label_col, x='jumps',
                 color=label_col,
                 title=title,
                 orientation='h',
                 category_orders={label_col: top_labels},
                 height=height,
                 template="plotly_dark")
    fig.update_layout(showlegend=False)
    return fig


def _fill_missing_days(timeline):
    """Add a row for every (day, continent) pair missing from *timeline*.

    *timeline* has one row per (day, continent) with 'jumps' and the running
    'total_jumps'. Added rows carry the continent's previous 'total_jumps'
    forward; 'jumps' (and 'total_jumps' before the continent's first row) are
    left as NaN for the caller to fill.
    """
    if timeline.empty:
        return timeline
    first_day = min(1, int(timeline['day'].min()))
    last_day = int(timeline['day'].max())
    grid = pd.MultiIndex.from_product(
        [range(first_day, last_day + 1), timeline['continent'].unique()],
        names=['day', 'continent'])
    filled = timeline.set_index(['day', 'continent']).reindex(grid)
    filled['total_jumps'] = filled.groupby(level='continent')['total_jumps'].ffill()
    return filled.reset_index()


def full_report():
    """Build the dashboard headline and all figures.

    The raw report is fetched from Google Analytics at most once every
    `reload_every` seconds and kept in the module-level cache
    (`cached_report`, `cache_time`, `reload_cache`); otherwise a copy of the
    cached DataFrame is used.

    Returns:
        A tuple of eleven items: the Markdown headline string followed by the
        figures ``total, total_weekly, avg, avg_weekly, total_map, icicle,
        jumps_over_time, pop_events, county_map, per_day_plot``.
    """
    global cached_report, cache_time, reload_cache
    if time.time() - cache_time > reload_every:
        reload_cache = False
    if cached_report is None or not reload_cache:
        print("Loading report...")
        df = _fetch_report()
        print(df['duration'].sum())
        # Only mark the cache as fresh once the fetch has succeeded, so a
        # failed request is retried on the next call instead of leaving an
        # empty cache behind.
        cached_report = df.copy(deep=True)
        cache_time = time.time()
        reload_cache = True
    else:
        print("Using cached report...")
        df = cached_report.copy(deep=True)

    total_jumps = int(df['jumps'].sum())
    unique_countries = df['country'].nunique()
    unique_cities = df['cityId'].nunique()

    print(f"Total jumps: {total_jumps}, unique countries: {unique_countries}, unique cities: {unique_cities}")
    df['iso'] = df['iso'].map(alpha_2_map)
    df['jumps'] = df['jumps'].astype(int)
    df['city'] = df['cityId'].map(ga_city_map)
    df['state'] = df['cityId'].map(ga_state_map)
    # Every row from Bermuda is attributed to Hamilton.
    df['city'] = df['city'].where(df['country'] != 'Bermuda', 'Hamilton')
    df['city'] = df['city'] + ', ' + df['state']
    df['world'] = 'Earth'

    # Rows from the last week of data (relative to the newest day present).
    recent_df = df[df['day'] >= df['day'].max() - 7]

    # plot a bar graph of the most popular exercises and their counts in the dataset
    exercise_df = df[~df['exercise'].isin(_UNKNOWN_EXERCISES)]
    exercise_counts = exercise_df['exercise'].value_counts().head(6)
    # Build the frame explicitly: the column layout of
    # value_counts().reset_index() differs between pandas 1.x and 2.x.
    top_exercises = pd.DataFrame({'exercise': exercise_counts.index,
                                  'count': exercise_counts.values})
    pop_events = px.bar(top_exercises,
                        y='exercise',
                        x='count',
                        color='exercise',
                        orientation='h',
                        title='Most Popular Exercises',
                        template="plotly_dark")
    pop_events.update_layout(showlegend=False)

    country_df = _jump_totals(df, ['country', 'iso'])
    country_df = country_df.sort_values(by=['jumps'], ascending=False)
    total = _ranked_bar(country_df, 'country', 15, 'Total Jumps by Country', 800)
    total_weekly = _ranked_bar(_jump_totals(recent_df, ['country', 'iso']),
                               'country', 10, 'Top Countries This Week', 500)

    icicle_df = _jump_totals(df, ['world', 'continent', 'country', 'state', 'city'])
    icicle = px.treemap(icicle_df, path=['world', 'continent', 'country', 'state', 'city'],
                        values='jumps',
                        title='Jumps by Continent/Country/City (click anywhere!)',
                        template="plotly_dark",
                        color_continuous_scale='jet',
                        range_color=[0, np.quantile(icicle_df['jumps'].values, q=0.99)],
                        branchvalues='total',
                        height=800,
                        maxdepth=4,
                        color='jumps')

    avg = _ranked_bar(_city_totals(df), 'city', 15, 'Total Jumps by City', 800)
    avg_weekly = _ranked_bar(_city_totals(recent_df), 'city', 10, 'Top Cities This Week', 500)
    avg.update(layout_coloraxis_showscale=False)
    avg_weekly.update(layout_coloraxis_showscale=False)

    country_df['rank'] = country_df['jumps'].rank(ascending=False)
    total_map = px.choropleth(country_df, locations="iso",
                              color="rank",
                              hover_name="country",  # column to add to hover information
                              hover_data=["jumps"],
                              color_continuous_scale="OrRd_r",
                              projection='natural earth',
                              template="plotly_dark")
    # remove the legend
    total_map.update_layout(showlegend=False)
    total_map.update(layout_coloraxis_showscale=False)

    county_df = df.copy()
    county_df['county'] = county_df['city'].map(city_county_map)
    county_df['count_name'] = county_df['city'].map(city_county_name_map)
    county_df = _jump_totals(county_df, ['county', 'count_name'])
    county_df['rank'] = county_df['jumps'].rank(ascending=False)
    # county codes are two digits for state, three for county
    county_df['county'] = county_df['county'].astype(int).astype(str).str.zfill(5)
    county_map = px.choropleth(county_df, geojson=counties, locations='county', color='rank',
                               color_continuous_scale="OrRd_r",
                               scope="usa",
                               hover_name="count_name",
                               hover_data=["jumps"],
                               template="plotly_dark")
    county_map.update_layout(showlegend=False)
    county_map.update(layout_coloraxis_showscale=False)

    timeline = _jump_totals(df, ['day', 'continent']).sort_values(by=['day'])
    timeline['total_jumps'] = timeline.groupby('continent')['jumps'].cumsum()
    # fill in any missing days with current max value
    timeline = _fill_missing_days(timeline)
    timeline = timeline.sort_values(by=['day']).reset_index(drop=True)
    # Fill before plotting so days ahead of a continent's first jump show a
    # running total of 0 instead of a gap.
    timeline = timeline.fillna(0)
    jumps_over_time = px.area(timeline, x='day',
                              y='total_jumps',
                              color='continent',
                              template="plotly_dark")
    daily_df = timeline.groupby(['day'])[['jumps']].sum().reset_index()
    per_day_plot = px.scatter(daily_df, x='day', y='jumps',
                              trendline='rolling',
                              trendline_options=dict(window=14),
                              trendline_color_override="goldenrod",
                              trendline_scope='overall',
                              template="plotly_dark")

    return f"# {total_jumps:,} total jumps in {unique_cities:,} cities across {unique_countries:,} countries", \
           total, total_weekly, avg, avg_weekly, total_map, icicle, jumps_over_time, pop_events, county_map, per_day_plot


def _plot_row(label):
    """Create a gr.Plot with the given label inside its own gr.Row."""
    with gr.Row():
        return gr.Plot(label=label)


# Page layout: a Markdown headline followed by one plot per row. The rows
# appear in the order in which the components are created below.
with gr.Blocks() as demo:
    with gr.Row():
        total_jumps_label = gr.Markdown("Total Jumps: 0")
    map_fig = _plot_row("Map")
    jumps_over_time = _plot_row("Jumps Over Time")
    total_plot = _plot_row("Top Countries (All Time)")
    total_plot_weekly = _plot_row("Top Countries (This Week)")
    avg_plot = _plot_row("Top Cities (All Time)")
    avg_plot_weekly = _plot_row("Top Cities (This Week)")
    icicle_fig = _plot_row("Treemap")
    per_day_plot = _plot_row("Jumps per Day")
    county_map = _plot_row("US Map")
    popular_events = _plot_row("Popular Events")

    # Listed in the order of the values returned by full_report(), which is
    # not the same as the on-page order above.
    outputs = [
        total_jumps_label,
        total_plot,
        total_plot_weekly,
        avg_plot,
        avg_plot_weekly,
        map_fig,
        icicle_fig,
        jumps_over_time,
        popular_events,
        county_map,
        per_day_plot,
    ]
    # Populate every component by running full_report when the page loads.
    dep = demo.load(fn=full_report, inputs=None, outputs=outputs)

if __name__ == "__main__":
    demo.launch(share=False)