Spaces:
Sleeping
Sleeping
| import numpy as np | |
| import pandas as pd | |
| import plotly.express as px | |
| import plotly.graph_objects as go | |
| import plotly.io as pio | |
| import streamlit as st | |
| from datetime import datetime | |
| from pprint import pprint | |
| from scipy.stats import bootstrap | |
# Load data
# Each non-blank line of data.txt is split on whitespace:
#   two tokens   -> annual record:  "<year> (<count>)"
#   otherwise    -> monthly record: "<month name> <year> (<count>)"
with open('data.txt', 'r', encoding='utf-8') as f:
    cases_data = f.readlines()

monthly_records = []
annual_records = []
for case_count in cases_data:
    data = case_count.split()
    # Blank (whitespace-only) lines carry no record. Without this guard they
    # fall through to the monthly branch and fail on data[2] with IndexError.
    if not data:
        continue
    # Annual data
    if len(data) == 2:
        annual_records.append((int(data[0]), int(data[1].strip('()'))))
        continue
    # Monthly data
    monthly_records.append((data[0], int(data[1]), int(data[2].strip('()'))))
# Presidential terms as (name, start, end) records.
# Every term ends on the day the next one starts, so only the start dates are
# spelled out and each end date is taken from the following entry.
term_starts = [
    ('Lyndon B. Johnson', datetime(1963, 11, 22)),
    ('Richard Nixon', datetime(1969, 1, 20)),
    ('Gerald Ford', datetime(1974, 8, 9)),
    ('Jimmy Carter', datetime(1977, 1, 20)),
    ('Ronald Reagan', datetime(1981, 1, 20)),
    ('George H. W. Bush', datetime(1989, 1, 20)),
    ('Bill Clinton', datetime(1993, 1, 20)),
    ('George W. Bush', datetime(2001, 1, 20)),
    ('Barack Obama', datetime(2009, 1, 20)),
    ('Donald Trump', datetime(2017, 1, 20)),
    ('Joe Biden', datetime(2021, 1, 20)),
]
# cut Biden short so that it lines up with our last data point
final_end = datetime(2023, 6, 28)
term_ends = [start for _, start in term_starts[1:]] + [final_end]
pres_records = [
    (name, start, end)
    for (name, start), end in zip(term_starts, term_ends)
]
pres_df = pd.DataFrame.from_records(pres_records, columns=['name', 'start', 'end'])
# Clean the data
month2int = {
    'January': 1,
    'February': 2,
    'March': 3,
    'April': 4,
    'May': 5,
    'June': 6,
    'July': 7,
    'August': 8,
    'September': 9,
    'October': 10,
    'November': 11,
    'December': 12,
}
mn_df = pd.DataFrame.from_records(monthly_records, columns=['month', 'year', 'cases'])
# we're using 28 as the placeholder "day" for all the months
mn_df['date'] = [
    datetime(year=int(yr), month=month2int[mn], day=28)
    for mn, yr in zip(mn_df['month'], mn_df['year'])
]

# This is the first year that has more than 1 case
clipped_mn_df = mn_df.query('year >= 1964')

# add 0s for months that are missing
# we cut off at 1964 but Johnson started in November of 1963
# There weren't any cases in 1963 so it's okay to start
# filling 0s from November of 1963
FILL_START = (1963, 11)
# our data goes through the end of the previous month (june 2023)
FILL_END = (2023, 6)

# `in` on a pandas Series tests the index rather than the values, so collect
# the (year, month) pairs that already have a row into a set and check that.
existing_months = {(d.year, d.month) for d in clipped_mn_df['date']}

new_rows = []
cur_yr, cur_mn = FILL_START
# Comparing (year, month) tuples stops the walk exactly after FILL_END,
# wherever the cursor starts.
while (cur_yr, cur_mn) <= FILL_END:
    if (cur_yr, cur_mn) not in existing_months:
        dt = datetime(year=cur_yr, month=cur_mn, day=28)
        new_rows.append((dt.strftime('%B'), dt.year, 0, dt))
    if cur_mn == 12:
        cur_yr += 1
        cur_mn = 1
    else:
        cur_mn += 1

zero_rows = pd.DataFrame.from_records(new_rows, columns=['month', 'year', 'cases', 'date'])
clipped_mn_df = pd.concat([clipped_mn_df, zero_rows], ignore_index=True)
# newest month first
clipped_mn_df = clipped_mn_df.sort_values(by='date', ascending=False).reset_index(drop=True)
# add the mean & std for each president
# Pull the (name, start, end) terms out once instead of re-walking pres_df
# for every date.
terms = list(pres_df[['name', 'start', 'end']].itertuples(index=False, name=None))

presidents = []
for d in clipped_mn_df['date']:
    # Take the first term that contains the date so every row gets exactly one
    # label. Both ends are inclusive, so a date falling on a shared boundary
    # goes to the earlier-listed term instead of being appended twice.
    match = next((name for name, start, end in terms if start <= d <= end), None)
    if match is None:
        # Fail with a clear message rather than a length mismatch on assignment.
        raise ValueError(f'no presidential term covers {d}')
    presidents.append(str(match))
clipped_mn_df['pres'] = presidents

# One row per president with the mean and standard deviation of monthly cases.
tmp = (
    clipped_mn_df
    .groupby('pres')['cases']
    .agg(cases_mean='mean', cases_std='std')
    .reset_index()
    .rename(columns={'pres': 'name'})
)
pres_df = pd.merge(pres_df, tmp, on='name', how='inner')
# bootstrap confidence intervals for the mean
# the data doesn't really look normal enough for 2 std to be super meaningful
pres_names = pres_df['name'].unique()
president_cis = []
for pres in pres_names:
    pres_rows = clipped_mn_df.query(f'pres == "{pres}"')
    cases = pres_rows['cases'].to_numpy()
    result = bootstrap(
        # shape (1, n): a sequence holding the single sample
        cases.reshape(1, -1),
        np.mean,
        vectorized=False,
        confidence_level=0.95,
        # "bias-corrected and accelerated" (shifts the CI bounds if the distribution is skewed)
        method='BCa',
    )
    interval = result.confidence_interval
    president_cis.append((pres, interval.low, interval.high))

ci_df = pd.DataFrame.from_records(president_cis, columns=['name', 'ci_low', 'ci_high'])
# add the confidence intervals to pres_df
pres_df = pd.merge(pres_df, ci_df, on='name')
| # Utils for converting colors | |
def hex2rgb(h):
    """
    Convert a 6-digit hex color string to an 'rgb(r, g, b)' string.

    '#FF44BB' -> 'rgb(255, 68, 187)'

    The leading '#' is optional. Raises ValueError when what remains is not
    exactly six hexadecimal digits (this includes empty input).
    """
    digits = h[1:] if h.startswith('#') else h
    # int(..., base=16) on its own also accepts pairs such as '+f' or ' f',
    # so check the characters explicitly before converting.
    if len(digits) != 6 or any(ch not in '0123456789abcdefABCDEF' for ch in digits):
        raise ValueError(f'malformed hex input: {h!r}')
    red, green, blue = (int(digits[i:i + 2], base=16) for i in range(0, 6, 2))
    return f'rgb({red}, {green}, {blue})'
def rgb2rgba(c, a=1.0):
    """
    Turn an 'rgb(...)' color string into an 'rgba(...)' string with alpha `a`.

    'rgb(95, 70, 144)' -> 'rgba(95, 70, 144, 1.0)'

    `a` defaults to 1.0 (100% opacity).
    """
    # put an 'a' right after the first three characters ('rgb')
    with_a = f'{c[:3]}a{c[3:]}'
    # replace the last character (the closing paren) with ', <alpha>)'
    return f'{with_a[:-1]}, {a})'
# Draw the plot
# streamlit ignores this template, but streamlit's own theme
# is pure white so it's okay I guess?
pio.templates.default = 'plotly_white'
FONT_SIZE = 14
f = go.Figure()

# monthly case counts as a grey bar plot
bar_color = '#bbbbbb'
bar_hover_label = {
    'bgcolor': rgb2rgba(hex2rgb(bar_color), 0.2),
    'font': {'size': FONT_SIZE},
}
cases_bar = go.Bar(
    x=clipped_mn_df['date'],
    y=clipped_mn_df['cases'],
    name='DOJ Antitrust Cases',
    marker_color=bar_color,
    marker_line_color=bar_color,
    hovertemplate='%{x}: <b>%{y}</b><extra></extra>',
    hoverlabel=bar_hover_label,
    # plotly's default legendrank is 1000 and smaller ranks are listed first,
    # so 1001 sorts this trace after the traces that keep the default rank
    legendrank=1000 + 1,
)
f.add_trace(cases_bar)
# add the president means + CI
pres_colors = px.colors.qualitative.Prism
# enumerate() gives a position that does not depend on pres_df's index labels,
# and the modulo wraps around if there are more rows than palette colors
for i, (_, r) in enumerate(pres_df.iterrows()):
    # set up colors for this president
    # (palette entries may be hex or 'rgb(...)' strings; normalize to rgb)
    pres_color = pres_colors[i % len(pres_colors)]
    if pres_color[0] == '#':
        pres_color = hex2rgb(pres_color)
    ci_color = rgb2rgba(pres_color, 0.5)
    hover_color = rgb2rgba(pres_color, 0.2)
    hover_str = f"<b>{r['name']}</b><br>Mean: <b>{r['cases_mean']:.2f}</b><br>95% CI: <b>({r['ci_low']:.2f}–{r['ci_high']:.2f})</b><extra></extra>"
    hover_label_fmt = {'bgcolor': hover_color, 'font': {'size': FONT_SIZE}}

    # add this president's confidence interval
    #
    # draw two lines like this
    #
    # o------------------o
    #
    # o------------------o
    #
    # make the lines transparent,
    # fill in the area between them
    upper = r['ci_high']
    lower = r['ci_low']
    f.add_trace(go.Scatter(
        x=[r['start'], r['end'], r['end'], r['start']],
        y=[upper, upper, lower, lower],
        fill='toself',
        fillcolor=ci_color,
        line_color=rgb2rgba(pres_color, 0),
        # I have to set `name` for it to show up when I hover over any part of the fill
        # otherwise the hover only comes up when I hover over the corners where the points are
        # but `name` doesn't do the <extra></extra> thing to remove the extra hover box
        name=hover_str.replace('<extra></extra>', ''),
        showlegend=False,
        hovertemplate=hover_str,
        hoverlabel=hover_label_fmt,
    ))

    # add this president's mean as a horizontal line across the term
    f.add_trace(go.Scatter(
        x=[r['start'], r['end']],
        y=[r['cases_mean'], r['cases_mean']],
        name=r['name'],
        line_color=pres_color,
        # I used to have vertical bars at the ends of the mean line
        # but I like it more without them
        # so just set the width to 0
        marker={'symbol': 'line-ns', 'line': {'width': 0, 'color': pres_color}},
        hovertemplate=hover_str,
        hoverlabel=hover_label_fmt,
    ))
# Trim the top of the plot a bit because there are a few outliers
# that make it hard to see the president aggregations
MAX_HEIGHT = 16
f.update_yaxes(range=[0, MAX_HEIGHT])

# add hashing over any bars taller than MAX_HEIGHT
# since we're cutting them off
too_tall = clipped_mn_df.loc[clipped_mn_df['cases'] > MAX_HEIGHT, 'date']
n_tall = len(too_tall)
hatch_height = MAX_HEIGHT * 0.25
hatch_base = MAX_HEIGHT - MAX_HEIGHT * 0.1
hatch_bars = go.Bar(
    x=too_tall,
    y=[hatch_height] * n_tall,
    base=[hatch_base] * n_tall,
    marker_color='#fff',
    marker_line_color='rgba(255,255,255,0)',
    marker_line_width=0,
    # I think I remember plotly uses milliseconds if the axis is a datetime
    # so the width has to be huge to cover a whole month
    # yep 1 month is 2.6 * 10**9 milliseconds
    width=3e9,
    # these are the options ['', '/', '\\', 'x', '-', '|', '+', '.']
    marker_pattern_shape='-',
    marker_pattern_fillmode='replace',
    showlegend=False,
)
f.add_trace(hatch_bars)
# Final layout tweaks, applied in one call.
# since streamlit doesn't respect the Plotly theme,
# we can instead make the background transparent
f.update_layout(
    barmode='stack',
    title="<b>What does the DOJ's Antitrust Division look like?</b>",
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
)

# Render the page: the chart across the full width, then the write-up
# in the middle of three columns.
st.set_page_config(layout='wide')
st.plotly_chart(f, use_container_width=True, theme=None)
col1, col2, col3 = st.columns(3)
data_notes = """
# The data
To get the data, I went to [the website for the Antitrust Division of the U.S. Department of Justice](https://www.justice.gov/atr/antitrust-case-filings), clicked "Filter by Case Open Date" in the left menu, and clicked "Show more." That gave me a pretty clean list that I could highlight and copy.
```
June 2023 (2)
April 2023 (1)
March 2023 (1)
February 2023 (2)
January 2023 (5)
2023 (11)
November 2022 (4)
[ . . . ]
```
There are some obvious problems with this data.
For example, I found [a Wikipedia article about U.S. antitrust law](https://en.wikipedia.org/wiki/United_States_antitrust_law).
That page refers to a case that happened in 1943, but my data doesn't have any cases in 1943.
I looked up the case (American Medical Association v. United States, 317 U.S. 519 (1943)) to see what the deal was.
I think that case, like the current case against Meta, was filed by the FTC—not the DOJ.
So this data is definitely not a complete record of U.S. antitrust cases.
There's also at least one typo in this random menu on the DOJ's website.
For the annual count of all cases opened in 2022, they list the correct amount but they label it "2026" instead.
I didn't notice any other typos.
I'm sure there are a few I missed.
That being said, this data is more than good enough for my goal.
I just want to vaguely describe the trend of antitrust cases with a pretty plot.
"""
col2.markdown(data_notes)