# dylanplummer's picture
# Update app.py
# 6ffd3ed verified
from google.analytics.data_v1beta import BetaAnalyticsDataClient
from google.analytics.data_v1beta.types import (
DateRange,
Dimension,
Metric,
RunReportRequest,
RunRealtimeReportRequest
)
import gradio as gr
import os
import json
import time
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
# GA4 event name whose eventValue metric carries the jump count.
FINISHED_EXERCISE = 'finished_exercise'
# Google Analytics 4 property to query.
PROPERTY_ID = "384068977"
try:
    # Deployed environments supply the service-account key as a JSON blob in
    # this env var (e.g. a hosted-space secret).
    credentials_json = os.environ['GOOGLE_APPLICATION_CREDENTIALS_JSON']
    credentials_dict = json.loads(credentials_json)
    # write json to file — the Google client expects a file path, not a blob
    with open('credentials.json', 'w') as f:
        json.dump(credentials_dict, f)
    # set env var to filename so google.auth default discovery finds it
    # NOTE(review): the file is written relative to the CWD but the env var
    # points next to this script — confirm these match when CWD != script dir.
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = os.path.join(os.path.dirname(__file__), 'credentials.json')
except KeyError: # running locally — env var absent, rely on ambient credentials
    pass
except Exception as e:
    # Best-effort: log and continue; the GA client will fail later if the
    # credentials are genuinely unusable.
    print(f"Error loading credentials: {e}")
    pass
# ISO country table: maps alpha-2 codes (reported by GA) to alpha-3 codes
# (used later as the choropleth 'locations' column).
iso = pd.read_csv('iso.tsv', sep='\t')
iso['Alpha-2 code'] = iso['Alpha-2 code'].str.strip()
iso['Alpha-3 code'] = iso['Alpha-3 code'].str.strip()
iso.set_index('Alpha-2 code', inplace=True)
alpha_2_map = iso['Alpha-3 code'].to_dict()
# read counties json file — presumably US county-boundary GeoJSON for the
# county choropleth; TODO confirm the file's schema
with open('counties.json') as f:
    counties = json.load(f)
# GA geographic-targets table: numeric Criteria ID -> city name / state.
ga_cities = pd.read_csv('cities.csv')
# US city gazetteer; indexed by "City, State" for county FIPS lookups.
cities = pd.read_csv('uscities.csv')
cities['full_city'] = cities['city'] + ', ' + cities['state_name']
cities.set_index('full_city', inplace=True)
ga_cities['Criteria ID'] = ga_cities['Criteria ID'].astype(str)
ga_cities.set_index('Criteria ID', inplace=True)
ga_city_map = ga_cities['Name'].to_dict()
# State is the second comma-separated component of 'Canonical Name'
# (presumably "City,State,Country" — verify against the cities.csv data).
ga_cities['state'] = ga_cities['Canonical Name'].str.split(',').str[1].str.strip()
ga_state_map = ga_cities['state'].to_dict()
# "City, State" -> county FIPS code / county name.
city_county_map = cities['county_fips'].to_dict()
city_county_name_map = cities['county_name'].to_dict()
# Module-level cache for the fetched GA report (see full_report()).
cached_report = None
# Unix timestamp of the last successful fetch.
cache_time = 0
# True while cached_report is considered fresh.
reload_cache = False
# Cache lifetime: 6 hours, expressed in seconds.
reload_every = 6 * 60 * 60
def mpl_to_plotly(cmap, pl_entries=11, rdigits=2):
# cmap - colormap
# pl_entries - int = number of Plotly colorscale entries
# rdigits - int -=number of digits for rounding scale values
scale = np.linspace(0, 1, pl_entries)
colors = (cmap(scale)[:, :3]*255).astype(np.uint8)
pl_colorscale = [[round(s, rdigits), f'rgb{tuple(color)}'] for s, color in zip(scale, colors)]
return pl_colorscale
def full_report():
    """Fetch jump data from Google Analytics and build every dashboard output.

    Returns a Markdown headline string followed by ten Plotly figures, in the
    exact order expected by the Gradio `outputs` list. The raw report
    DataFrame is cached in module globals and refreshed at most once every
    `reload_every` seconds.
    """
    global cached_report, cache_time, reload_cache
    # Expire the cache once it is older than reload_every seconds.
    if time.time() - cache_time > reload_every:
        reload_cache = False
    if not reload_cache:
        print("Loading report...")
        reload_cache = True
        cache_time = time.time()
        client = BetaAnalyticsDataClient()
        # first request all data where we have the exercise name
        request = RunReportRequest(
            property=f"properties/{PROPERTY_ID}",
            dimensions=[Dimension(name="nthDay"),
                        Dimension(name='eventName'),
                        Dimension(name="continent"),
                        Dimension(name="country"),
                        Dimension(name="countryId"),
                        Dimension(name="cityId"),
                        Dimension(name="customEvent:exercise")],
            metrics=[Metric(name="eventValue")],
            #return_property_quota=True,
            date_ranges=[DateRange(start_date="2023-06-30", end_date="today")],
        )
        response = client.run_report(request)
        # Accumulate one list per column, then build the DataFrame in one go.
        res = {'day': [], 'jumps': [], 'continent': [], 'country': [], 'iso': [], 'cityId': [], 'exercise': []}
        for row in response.rows:
            event_name = row.dimension_values[1].value
            # Only 'finished_exercise' events carry a jump count.
            if event_name == FINISHED_EXERCISE:
                day = int(row.dimension_values[0].value)  # nthDay since start_date
                continent = row.dimension_values[2].value
                country = row.dimension_values[3].value
                country_iso = row.dimension_values[4].value  # alpha-2 code
                city = row.dimension_values[5].value  # GA numeric city criteria id
                exercise = row.dimension_values[6].value
                event_value = float(row.metric_values[0].value)
                res['day'].append(day)
                res['jumps'].append(event_value)
                res['continent'].append(continent)
                res['country'].append(country)
                res['iso'].append(country_iso)
                res['cityId'].append(city)
                res['exercise'].append(exercise)
        df = pd.DataFrame.from_dict(res)
        # then find the earliest day we started getting exercise name data
        first_day = int(df['day'].min())
        end_date = pd.to_datetime('2023-06-30') + pd.DateOffset(days=first_day)
        # only need YYYY-MM-DD
        end_date = str(end_date.strftime('%Y-%m-%d'))
        # then request all data where we don't have the exercise name
        # (same query minus customEvent:exercise, capped at first_day).
        request = RunReportRequest(
            property=f"properties/{PROPERTY_ID}",
            dimensions=[Dimension(name="nthDay"),
                        Dimension(name='eventName'),
                        Dimension(name="continent"),
                        Dimension(name="country"),
                        Dimension(name="countryId"),
                        Dimension(name="cityId")],
            metrics=[Metric(name="eventValue")],
            #return_property_quota=True,
            date_ranges=[DateRange(start_date="2023-06-30", end_date=end_date)],
        )
        response = client.run_report(request)
        res = {'day': [], 'jumps': [], 'continent': [], 'country': [], 'iso': [], 'cityId': [], 'exercise': []}
        for row in response.rows:
            event_name = row.dimension_values[1].value
            if event_name == FINISHED_EXERCISE:
                day = int(row.dimension_values[0].value)
                continent = row.dimension_values[2].value
                country = row.dimension_values[3].value
                country_iso = row.dimension_values[4].value
                city = row.dimension_values[5].value
                event_value = float(row.metric_values[0].value)
                res['day'].append(day)
                res['jumps'].append(event_value)
                res['continent'].append(continent)
                res['country'].append(country)
                res['iso'].append(country_iso)
                res['cityId'].append(city)
                # exercise name is unknown for this older date range
                res['exercise'].append('n/a')
        new_df = pd.DataFrame.from_dict(res)
        # drop any rows we already have
        #new_df = new_df[new_df['day'] < first_day]
        df = pd.concat([df, new_df]).reset_index(drop=True)
        # Exercise ids appear to encode a duration after a 2-char prefix,
        # e.g. 'xx120' -> 120 — TODO confirm against the event naming scheme.
        df['duration'] = df['exercise'].apply(lambda x: 0 if x in ['n/a', '(not set)'] else int(x[2:]))
        print(df['duration'].sum())
        cached_report = df.copy(deep=True)
    else:
        print("Using cached report...")
        df = cached_report.copy(deep=True)
    # Headline totals for the Markdown banner.
    total_jumps = int(df['jumps'].sum())
    unique_countries = df['country'].nunique()
    unique_cities = df['cityId'].nunique()
    print(f"Total jumps: {total_jumps}, unique countries: {unique_countries}, unique cities: {unique_cities}")
    # Alpha-2 -> alpha-3 ISO codes (used as choropleth 'locations' below).
    df['iso'] = df['iso'].map(alpha_2_map)
    df['jumps'] = df['jumps'].astype(int)
    # Resolve GA city criteria ids to readable "City, State" names.
    df['city'] = df['cityId'].map(ga_city_map)
    df['state'] = df['cityId'].map(ga_state_map)
    # Special case: Bermuda rows are forced to 'Hamilton' — presumably the GA
    # city mapping is unreliable there; TODO confirm.
    df['city'] = df.apply(lambda row: row['city'] if row['country'] != 'Bermuda' else 'Hamilton', axis=1)
    df['city'] = df['city'] + ', ' + df['state']
    # All-time country totals, ranked. NOTE(review): despite the name,
    # top_10_countries actually keeps the top 15.
    country_df = df.groupby(['country', 'iso']).sum().reset_index()
    country_df = country_df.sort_values(by=['jumps'], ascending=False)
    top_10_countries = country_df.iloc[:15]['country'].tolist()
    country_df_to_plot = df.groupby(['country', 'iso']).sum().reset_index()
    country_df_to_plot = country_df_to_plot[country_df_to_plot['country'].isin(top_10_countries)].reset_index(drop=True)
    country_df_to_plot = country_df_to_plot.sort_values(by=['jumps'], ascending=True)
    df['rank'] = df['jumps'].rank(ascending=False)
    df['world'] = 'Earth'  # synthetic root node for the treemap hierarchy
    # Rows carrying a real exercise name only.
    exercise_df = df[~df['exercise'].isin(['n/a', '(not set)'])]
    # plot a bar graph of the most popular exercises and their counts in the dataset
    top_6_events = exercise_df['exercise'].value_counts().reset_index()[:6]
    pop_events = px.bar(top_6_events,
                        y=top_6_events.index,
                        x='exercise',
                        color=top_6_events.index,
                        title='Most Popular Exercises',
                        template="plotly_dark")
    pop_events.update_layout(showlegend=False)
    total = px.bar(country_df_to_plot,
                   y='country', x='jumps',
                   color='country',
                   title='Total Jumps by Country',
                   orientation='h',
                   category_orders={'country': top_10_countries},
                   height=800,
                   template="plotly_dark")
    total.update_layout(showlegend=False)
    # Same country ranking restricted to the last week (day >= max - 7).
    country_df_to_plot_weekly = df[df['day'] >= df['day'].max() - 7].groupby(['country', 'iso']).sum().reset_index()
    country_df_to_plot_weekly = country_df_to_plot_weekly.sort_values(by=['jumps'], ascending=False)
    top_5_weekly = country_df_to_plot_weekly.iloc[:10]['country'].tolist()
    country_df_to_plot_weekly = country_df_to_plot_weekly[country_df_to_plot_weekly['country'].isin(top_5_weekly)].reset_index(drop=True)
    country_df_to_plot_weekly = country_df_to_plot_weekly.sort_values(by=['jumps'], ascending=True)
    total_weekly = px.bar(country_df_to_plot_weekly,
                          y='country', x='jumps',
                          color='country',
                          title='Top Countries This Week',
                          orientation='h',
                          category_orders={'country': top_5_weekly},
                          height=500,
                          template="plotly_dark")
    total_weekly.update_layout(showlegend=False)
    # All-time city totals; again "top 10" actually keeps 15.
    city_df = df.groupby(['city', 'iso']).sum().reset_index()
    city_df = city_df.sort_values(by=['jumps'], ascending=False)
    city_df['city'] = city_df.apply(lambda row: row['city'] + ', ' + row['iso'], axis=1)
    top_10_cities = city_df.iloc[:15]['city'].tolist()
    # Hierarchical world -> continent -> country -> state -> city totals.
    icicle_df = df.groupby(['world', 'continent', 'country', 'state', 'city']).sum().reset_index()
    #icicle_df['log10_jumps'] = icicle_df['jumps'].apply(lambda x: math.log10(x) if x > 0 else 0)
    # icicle = px.icicle(icicle_df, path=['world', 'continent', 'country', 'city'],
    #                    values='jumps',
    #                    title='Jumps by Continent/Country',
    #                    template="plotly_dark",
    #                    color_continuous_scale='OrRd',
    #                    maxdepth=7,
    #                    branchvalues='remainder',
    #                    color='jumps')
    print(df.columns)
    # NOTE(review): nipy_spec is never used below — dead code?
    nipy_spec = mpl_to_plotly(plt.cm.nipy_spectral, pl_entries=15)
    icicle = px.treemap(icicle_df, path=['world', 'continent', 'country', 'state', 'city'],
                        values='jumps',
                        title='Jumps by Continent/Country/City (click anywhere!)',
                        template="plotly_dark",
                        color_continuous_scale='jet',
                        # clip the color range at the 99th percentile so a few
                        # huge leaves don't wash out the rest of the map
                        range_color=[0, np.quantile(icicle_df['jumps'].values, q=0.99)],
                        branchvalues='total',
                        height=800,
                        maxdepth=4,
                        color='jumps')
    # Rebuild city totals, dropping GA's '(not set)' placeholder rows.
    city_df = df.groupby(['city', 'iso']).sum().reset_index()
    city_df = city_df[city_df['city'] != '(not set)']
    city_df['city'] = city_df.apply(lambda row: row['city'] + ', ' + row['iso'], axis=1)
    city_df = city_df[city_df['city'].isin(top_10_cities)].reset_index(drop=True)
    city_df = city_df.sort_values(by=['jumps'], ascending=True)
    avg = px.bar(city_df,
                 y='city', x='jumps', color='city',
                 title='Total Jumps by City',
                 orientation='h',
                 category_orders={'city': top_10_cities},
                 height=800,
                 template="plotly_dark")
    # City totals restricted to the last week of data.
    city_df_weekly = df[df['day'] >= df['day'].max() - 7].groupby(['city', 'iso']).sum().reset_index()
    city_df_weekly = city_df_weekly[city_df_weekly['city'] != '(not set)']
    city_df_weekly['city'] = city_df_weekly.apply(lambda row: row['city'] + ', ' + row['iso'], axis=1)
    city_df_weekly = city_df_weekly.sort_values(by=['jumps'], ascending=False)
    top_5_weekly = city_df_weekly.iloc[:10]['city'].tolist()
    city_df_weekly = city_df_weekly[city_df_weekly['city'].isin(top_5_weekly)].reset_index(drop=True)
    city_df_weekly = city_df_weekly.sort_values(by=['jumps'], ascending=True)
    avg_weekly = px.bar(city_df_weekly,
                        y='city', x='jumps', color='city',
                        title='Top Cities This Week',
                        orientation='h',
                        category_orders={'city': top_5_weekly},
                        height=500,
                        template="plotly_dark")
    avg.update_layout(showlegend=False)
    avg.update(layout_coloraxis_showscale=False)
    avg_weekly.update_layout(showlegend=False)
    avg_weekly.update(layout_coloraxis_showscale=False)
    # World choropleth colored by rank; '_r' reverses the scale so rank 1
    # (most jumps) gets the darkest color.
    country_df['rank'] = country_df['jumps'].rank(ascending=False)
    total_map = px.choropleth(country_df, locations="iso",
                              color="rank",
                              hover_name="country", # column to add to hover information
                              hover_data=["jumps"],
                              color_continuous_scale ="OrRd_r",
                              projection='natural earth',
                              template="plotly_dark")
    # remove the legend
    total_map.update_layout(showlegend=False)
    total_map.update(layout_coloraxis_showscale=False)
    # US county-level choropleth: map "City, State" -> county FIPS code.
    county_df = df.copy()
    county_df['county'] = county_df['city'].map(city_county_map)
    county_df['count_name'] = county_df['city'].map(city_county_name_map)
    county_df = county_df.groupby(['county', 'count_name']).sum().reset_index()
    county_df['rank'] = county_df['jumps'].rank(ascending=False)
    county_df['county'] = county_df['county'].astype(int)
    county_df['county'] = county_df['county'].astype(str).str.zfill(5) # county codes are two digits for state, three for county
    county_map = px.choropleth(county_df, geojson=counties, locations='county', color='rank',
                               color_continuous_scale="OrRd_r",
                               scope="usa",
                               hover_name="count_name",
                               hover_data=["jumps"],
                               template="plotly_dark"
                               )
    county_map.update_layout(showlegend=False)
    county_map.update(layout_coloraxis_showscale=False)
    # Cumulative jumps per continent over time (stacked area chart).
    # NOTE: df is reassigned here; earlier per-row columns are gone below.
    df = df.groupby(['day', 'continent']).sum().reset_index()
    df = df.sort_values(by=['day'])
    df['total_jumps'] = df.groupby('continent')['jumps'].cumsum()
    # fill in any missing days with current max value so the area chart
    # doesn't dip to zero on days with no events for a continent
    for day in range(1, int(df['day'].max()) + 1):
        for continent in df['continent'].unique():
            if not df[(df['day'] == day) & (df['continent'] == continent)].any().any():
                max_jumps = df[(df['day'] < day) & (df['continent'] == continent)]['total_jumps'].max()
                df = pd.concat([df, pd.DataFrame([[day, continent, max_jumps]], columns=['day', 'continent', 'total_jumps'])])
                #df = df.append({'day': day, 'continent': continent, 'total_jumps': max_jumps}, ignore_index=True)
    df = df.sort_values(by=['day']).reset_index(drop=True)
    jumps_over_time = px.area(df, x='day',
                              y='total_jumps',
                              color='continent',
                              template="plotly_dark")
    # NOTE(review): fillna runs after the area figure is built, so the rows
    # appended above still carry NaN 'jumps' in that figure — confirm intended.
    df.fillna(0, inplace=True)
    # Total jumps per day with a 14-day rolling trendline.
    daily_df = df.groupby(['day'])[['jumps']].sum().reset_index()
    per_day_plot = px.scatter(daily_df, x='day', y='jumps',
                              trendline='rolling',
                              trendline_options=dict(window=14),
                              trendline_color_override="goldenrod",
                              trendline_scope='overall',
                              template="plotly_dark")
    # Order must match the Gradio `outputs` list defined at module level.
    return f"# {total_jumps:,} total jumps in {unique_cities:,} cities across {unique_countries:,} countries", \
        total, total_weekly, avg, avg_weekly, total_map, icicle, jumps_over_time, pop_events, county_map, per_day_plot
def _full_width_plot(label):
    # Helper: create one full-width row holding a single Plot component.
    # (gr.Row() tracks the surrounding Blocks context, so this works when
    # called inside the `with gr.Blocks()` block below.)
    with gr.Row():
        return gr.Plot(label=label)


# Dashboard layout: a headline banner followed by one figure per row.
with gr.Blocks() as demo:
    with gr.Row():
        total_jumps_label = gr.Markdown("Total Jumps: 0")
    map_fig = _full_width_plot("Map")
    jumps_over_time = _full_width_plot("Jumps Over Time")
    total_plot = _full_width_plot("Top Countries (All Time)")
    total_plot_weekly = _full_width_plot("Top Countries (This Week)")
    avg_plot = _full_width_plot("Top Cities (All Time)")
    avg_plot_weekly = _full_width_plot("Top Cities (This Week)")
    icicle_fig = _full_width_plot("Treemap")
    per_day_plot = _full_width_plot("Jumps per Day")
    county_map = _full_width_plot("US Map")
    popular_events = _full_width_plot("Popular Events")
    # This order must match the tuple returned by full_report().
    outputs = [total_jumps_label, total_plot, total_plot_weekly, avg_plot, avg_plot_weekly, map_fig, icicle_fig, jumps_over_time, popular_events, county_map, per_day_plot]
    # Populate every component once when the page loads.
    dep = demo.load(full_report, None, outputs)

if __name__ == "__main__":
    demo.launch(share=False)