"""Shiny app that tabulates 2025 MLB pitch data against a 2024 baseline."""

import datetime

import polars as pl
from pytabulator import TableOptions, Tabulator, output_tabulator, render_tabulator, theme
from shiny import App, reactive, ui

import api_scraper
from stuff_model import feature_engineering as fe, stuff_apply

theme.tabulator_site()

# =============================================================================
# CONSTANTS & CONFIGURATION
# =============================================================================

# Custom CSS for loading spinner
CUSTOM_CSS = """ """

# Custom JavaScript for busy state handling
CUSTOM_JS = """ """

# Initialize scrapers
scraper = api_scraper.MLB_Scrape()

# Previous-season aggregates used as the comparison baseline.
df_year_old_group = pl.read_parquet('pitch_data_agg_2024.parquet')
pitcher_old_dict = dict(zip(df_year_old_group['pitcher_id'], df_year_old_group['pitcher_name']))

# Column definitions for difference calculations: (current column, baseline column)
COLS_TO_SUBTRACT = [
    ("start_speed", "start_speed_old"),
    ("max_start_speed", "max_start_speed_old"),
    ("ivb", "ivb_old"),
    ("hb", "hb_old"),
    ("release_pos_z", "release_pos_z_old"),
    ("release_pos_x", "release_pos_x_old"),
    ("extension", "extension_old"),
    ("tj_stuff_plus", "tj_stuff_plus_old"),
]

COLS_TO_SUBTRACT_PERCENT = [
    ("pitch_percent", "pitch_percent_old"),
    ("rhh_percent", "rhh_percent_old"),
    ("lhh_percent", "lhh_percent_old"),
]

PITCH_TYPES = ['CH', 'CU', 'FC', 'FF', 'FS', 'SI', 'SL', 'ST', 'All']

# Value stored in a "*_diff" column when the baseline value is missing; the
# formatters test for it to decide how to render the cell.
NO_BASELINE_DIFF = 10000.0

# Grouping keys shared by the season-level and game-level aggregations.
SEASON_GROUP_COLS = ['pitcher_id', 'pitcher_name', 'pitch_type']
DAILY_JOIN_COLS = ['pitcher_id', 'game_id']
DAILY_GROUP_COLS = ['pitcher_id', 'pitcher_name', 'pitch_type', 'game_id', 'game_date']

# Metric columns exported by the CSV downloads, in output order.
DOWNLOAD_METRIC_COLS = [
    'count', 'pitch_percent', 'rhh_percent', 'lhh_percent',
    'start_speed', 'max_start_speed', 'ivb', 'hb',
    'release_pos_z', 'release_pos_x', 'extension', 'tj_stuff_plus',
]

# Bullets are kept on their own, unindented lines so markdown renders a list.
APP_DESCRIPTION = (
    "This app generates a table which shows the 2025 MLB Pitch data.\n"
    "* Differences are calculated based on 2024 regular season data\n"
    "* If 2024 data does not exist for pitcher, 2023 Data is used\n"
    "* If no difference exists, the pitch is labelled as a new pitch"
)

# Standard aggregation expressions (reusable across functions)
PITCH_AGG_EXPRS = [
    pl.col('game_date').max().alias('last_pitched'),
    pl.col('start_speed').count().alias('count'),
    pl.col('start_speed').mean().alias('start_speed'),
    pl.col('start_speed').max().alias('max_start_speed'),
    pl.col('ivb').mean().alias('ivb'),
    pl.col('hb').mean().alias('hb'),
    pl.col('release_pos_z').mean().alias('release_pos_z'),
    pl.col('release_pos_x').mean().alias('release_pos_x'),
    pl.col('extension').mean().alias('extension'),
    pl.col('tj_stuff_plus').mean().alias('tj_stuff_plus'),
    pl.col("batter_hand").eq("R").sum().alias("rhh_count"),
    pl.col("batter_hand").eq("L").sum().alias("lhh_count"),
]


# =============================================================================
# COLUMN DEFINITIONS
# =============================================================================

def _filter_column(title, field, width, frozen=True):
    """Build a column definition with a text header filter."""
    return {"title": title, "field": field, "width": width,
            "headerFilter": "input", "frozen": frozen}


def _text_column(title, field, width=85):
    """Build a column definition rendered with the multi-line 'textarea' formatter."""
    return {"title": title, "field": field, "width": width, "formatter": "textarea"}


def get_all_columns():
    """Return the column definitions for the main per-pitch-type tables."""
    return [
        _filter_column("ID", "pitcher_id", 80),
        _filter_column("Pitcher Name", "pitcher_name", 200),
        _filter_column("Team", "pitcher_team", 70),
        _filter_column("Last Pitched", "last_pitched", 110),
        _filter_column("Pitch Type", "pitch_type", 90),
        _filter_column("New?", "new_pitch", 60, frozen=False),
        {"title": "Pitches", "field": "count", "width": 75},
        _text_column("Pitch%", "pitch_percent_formatted"),
        _text_column("LHH%", "lhh_percent_formatted"),
        _text_column("RHH%", "rhh_percent_formatted"),
        _text_column("Velocity", "start_speed_formatted"),
        _text_column("Max Velo", "max_start_speed_formatted"),
        _text_column("iVB", "ivb_formatted"),
        _text_column("HB", "hb_formatted"),
        _text_column("RelH", "release_pos_z_formatted"),
        _text_column("RelS", "release_pos_x_formatted"),
        _text_column("Extension", "extension_formatted", 90),
        _text_column("tjStuff+", "tj_stuff_plus_formatted", 90),
    ]


def get_daily_columns():
    """Get columns for daily table (replaces 'Last Pitched' with 'Date')."""
    date_column = _filter_column("Date", "game_date", 100)
    # Match on the field name rather than a positional index so that
    # reordering get_all_columns() cannot silently replace the wrong column.
    return [
        date_column if col["field"] == "last_pitched" else col
        for col in get_all_columns()
    ]


def get_tjstuff_columns():
    """Get columns for tjStuff+ table with 2024 comparison."""
    return get_all_columns() + [
        _text_column("2024 tjStuff+", "tj_stuff_plus_old", 100),
        _text_column("Δ", "tj_stuff_plus_diff", 70),
    ]


def get_summary_columns():
    """Get columns for summary table (one tjStuff+ column per pitch type)."""
    return [
        _filter_column("ID", "pitcher_id", 80),
        _filter_column("Pitcher Name", "pitcher_name", 200),
        _filter_column("Team", "pitcher_team", 70),
        {"title": "Pitches", "field": "count", "width": 80, "headerFilter": "input"},
    ] + [_text_column(col, col, 70) for col in PITCH_TYPES]


def get_team_columns():
    """Get columns for team table."""
    return [
        _filter_column("Team", "pitcher_team", 150),
        {"title": "Pitches", "field": "count", "width": 120, "headerFilter": "input"},
        _text_column("tjStuff+", "tj_stuff_plus", 100),
    ]


# =============================================================================
# DATA PROCESSING HELPERS
# =============================================================================

def compute_pitcher_totals(df, group_cols=None):
    """Count rows per group (default: per pitcher) as 'pitcher_total'."""
    group_cols = group_cols or ["pitcher_id"]
    return df.group_by(group_cols).agg(pl.len().alias("pitcher_total"))


def compute_hand_totals(df, group_cols=None):
    """Count rows per group split by batter hand.

    Returns one row per group with 'pitcher_total_left' and
    'pitcher_total_right' columns (0 where a hand was never faced).
    """
    group_cols = list(group_cols or ["pitcher_id"])
    pivoted = (
        df.group_by(group_cols + ["batter_hand"])
        .agg(pl.len().alias("pitcher_total"))
        .pivot(values="pitcher_total", index=group_cols, on="batter_hand",
               aggregate_function="sum")
    )
    # The pivot only creates a column for hands present in the data; add the
    # absent ones so the rename below cannot fail on a one-handed subset.
    absent = [
        pl.lit(0, dtype=pl.UInt32).alias(hand)
        for hand in ("L", "R")
        if hand not in pivoted.columns
    ]
    if absent:
        pivoted = pivoted.with_columns(absent)
    return (
        pivoted
        .rename({"L": "pitcher_total_left", "R": "pitcher_total_right"})
        .fill_null(0)
    )


def aggregate_pitch_data(df, group_cols):
    """Aggregate pitch data by specified grouping columns."""
    return df.group_by(group_cols).agg(PITCH_AGG_EXPRS)


def add_pitch_percentages(df):
    """Add usage-share columns: overall, versus RHH, and versus LHH.

    Expects 'count', 'rhh_count', 'lhh_count' and the three 'pitcher_total*'
    columns to be present. A zero denominator is not special-cased.
    """
    return df.with_columns([
        (pl.col("count") / pl.col("pitcher_total")).alias("pitch_percent"),
        (pl.col("rhh_count") / pl.col("pitcher_total_right")).alias("rhh_percent"),
        (pl.col("lhh_count") / pl.col("pitcher_total_left")).alias("lhh_percent"),
    ])


def mark_new_pitches(df, old_df, speed_col="start_speed_old"):
    """Flag pitches that have no baseline row for a pitcher who does have one.

    'new_pitch' is True when ``speed_col`` is null and the pitcher appears in
    ``old_df``; otherwise it is null.
    """
    is_new = pl.col(speed_col).is_null() & pl.col('pitcher_id').is_in(old_df['pitcher_id'])
    return df.with_columns(
        pl.when(is_new).then(pl.lit(True)).otherwise(pl.lit(None)).alias("new_pitch")
    )


def format_diff_value(new_col: str, diff_col: str) -> pl.Expr:
    """Format a numeric column with its diff in parentheses using native Polars.

    Renders "<value>\\n(<signed diff>)", or "<value>\\n\\t" when the diff holds
    the missing-baseline sentinel.
    """
    value_text = pl.col(new_col).round(1).cast(pl.Utf8)
    diff_text = pl.col(diff_col).round(1).cast(pl.Utf8)
    signed_diff = (
        pl.when(pl.col(diff_col) >= 0)
        .then(pl.lit("+") + diff_text)
        .otherwise(diff_text)
    )
    return (
        pl.when(pl.col(diff_col).eq(NO_BASELINE_DIFF))
        .then(value_text + "\n\t")
        .otherwise(value_text + "\n(" + signed_diff + ")")
    )


def format_percent_diff_value(new_col: str, diff_col: str) -> pl.Expr:
    """Format a percent column with its diff in parentheses using native Polars.

    When the diff holds the missing-baseline sentinel, the value itself is
    shown as the change (i.e. the baseline is treated as 0%).
    """
    new_pct = (pl.col(new_col) * 100).round(1)
    diff_pct = (pl.col(diff_col) * 100).round(1)

    def signed(pct: pl.Expr) -> pl.Expr:
        # Prefix non-negative values with "+"; negatives already carry "-".
        return pl.when(pct >= 0).then(pl.lit("+")).otherwise(pl.lit("")) + pct.cast(pl.Utf8)

    prefix = new_pct.cast(pl.Utf8) + "%\n("
    return (
        pl.when(pl.col(diff_col).eq(NO_BASELINE_DIFF))
        .then(prefix + signed(new_pct) + "%)")
        .otherwise(prefix + signed(diff_pct) + "%)")
    )


def _with_diff_columns(df, cols_to_subtract):
    """Add '<new>_diff' = new - old, or the sentinel where old is null."""
    return df.with_columns([
        pl.when(pl.col(old).is_null())
        .then(pl.lit(NO_BASELINE_DIFF))
        .otherwise(pl.col(new) - pl.col(old))
        .alias(new + "_diff")
        for new, old in cols_to_subtract
    ])


def format_diff_columns(df, cols_to_subtract):
    """Create diff columns and format them using native Polars expressions."""
    return _with_diff_columns(df, cols_to_subtract).with_columns([
        format_diff_value(new, new + "_diff").alias(new + "_formatted")
        for new, _ in cols_to_subtract
    ])


def format_percent_diff_columns(df, cols_to_subtract):
    """Create percent diff columns and format them using native Polars expressions."""
    return _with_diff_columns(df, cols_to_subtract).with_columns([
        format_percent_diff_value(new, new + "_diff").alias(new + "_formatted")
        for new, _ in cols_to_subtract
    ])


def add_team_column(df, spring_df):
    """Add a 'pitcher_team' column to a pandas frame, looked up by pitcher_id.

    ``df`` is modified in place and returned. When a pitcher has several rows
    in ``spring_df``, the last row's team is used.
    """
    # De-duplicate in Polars first so the Python dict is built from one row
    # per pitcher instead of one per pitch; keep='last' matches the
    # last-write-wins result of zipping the full columns.
    # NOTE(review): spring_data() sorts by game_date descending, so "last" is
    # the earliest game - confirm that is the intended team for traded players.
    latest = (
        spring_df.select(['pitcher_id', 'pitcher_team'])
        .unique(subset=['pitcher_id'], keep='last')
    )
    team_dict = dict(zip(latest['pitcher_id'], latest['pitcher_team']))
    df['pitcher_team'] = df['pitcher_id'].map(team_dict)
    return df


def process_and_aggregate(df_stuff, group_cols, join_cols=None):
    """Standard pipeline: aggregate, join totals, add percentages.

    ``group_cols`` defines the output rows; ``join_cols`` defines the
    denominator groups for the percentage columns (default: per pitcher).
    """
    join_cols = join_cols or ["pitcher_id"]
    df_totals = compute_pitcher_totals(df_stuff, join_cols)
    df_hand_totals = compute_hand_totals(df_stuff, join_cols)
    df_group = (
        aggregate_pitch_data(df_stuff, group_cols)
        .join(df_totals, on=join_cols, how="left")
        .join(df_hand_totals, on=join_cols, how="left")
    )
    return add_pitch_percentages(df_group)


def merge_and_format(df_group, old_df, cols_sub=None, cols_pct=None, suffix="_old"):
    """Merge with baseline data and apply formatting.

    Left-joins ``old_df`` on (pitcher_id, pitch_type) with ``suffix`` on the
    clashing columns, flags new pitches, then adds the '*_diff' and
    '*_formatted' columns for ``cols_sub`` and ``cols_pct``.
    """
    cols_sub = cols_sub or COLS_TO_SUBTRACT
    cols_pct = cols_pct or COLS_TO_SUBTRACT_PERCENT
    df_merge = df_group.join(old_df, on=['pitcher_id', 'pitch_type'], how='left', suffix=suffix)
    df_merge = mark_new_pitches(df_merge, old_df, f"start_speed{suffix}")
    df_merge = format_diff_columns(df_merge, cols_sub)
    return format_percent_diff_columns(df_merge, cols_pct)


# =============================================================================
# UI DEFINITION
# =============================================================================

def _download_column(button_id):
    """One-wide column holding a 'Download Data' button."""
    return ui.column(1, ui.download_button(button_id, "Download Data", class_="btn-sm mb-3"))


def _min_pitches_column(input_id):
    """Two-wide column holding the 'Pitches >=' numeric filter."""
    return ui.column(
        2,
        ui.div(
            {"class": "input-group"},
            ui.span("Pitches >=", class_="input-label"),
            ui.input_numeric(id=input_id, label='', value=1, min=1, width="100px"),
        ),
    )


app_ui = ui.page_fluid(
    # Inject custom CSS and JS
    ui.head_content(ui.HTML(CUSTOM_CSS), ui.HTML(CUSTOM_JS)),
    # Loading overlay
    ui.div(
        ui.div(class_="spinner"),
        ui.div("Loading data...", class_="loading-text"),
        class_="loading-overlay",
        id="loading-overlay",
    ),
    ui.card(
        ui.card_header("2025 MLB Pitch Data App"),
        ui.row(
            ui.column(
                4,
                ui.markdown(APP_DESCRIPTION),
                ui.input_action_button("refresh", "Refresh Data", class_="btn-primary", width="100%"),
            ),
            ui.column(
                3,
                ui.div("By: ", ui.tags.a("@TJStats", href="https://x.com/TJStats", target="_blank")),
                ui.tags.p("Data: MLB"),
                ui.tags.p(ui.tags.a(
                    "Support me on Patreon for more baseball content",
                    href="https://www.patreon.com/TJ_Stats",
                    target="_blank",
                )),
            ),
        ),
        ui.navset_tab(
            ui.nav_panel(
                "All Pitches",
                ui.row(_download_column("download_all"), _min_pitches_column('pitches_all_min')),
                output_tabulator("table_all"),
            ),
            ui.nav_panel(
                "Last Game to Season",
                ui.row(_min_pitches_column('pitches_all_compare_min')),
                output_tabulator("table_all_compare"),
            ),
            ui.nav_panel(
                "Daily Pitches",
                ui.row(_download_column("download_daily"), _min_pitches_column('pitches_daily_min')),
                output_tabulator("table_daily"),
            ),
            ui.nav_panel(
                "tjStuff+",
                ui.row(_min_pitches_column('pitches_tjstuff_min')),
                output_tabulator("table_tjstuff"),
            ),
            ui.nav_panel(
                "tjStuff+ Summary",
                ui.row(_download_column("download_tjsumm"), _min_pitches_column('pitches_tjsumm_min')),
                output_tabulator("table_stuff_all"),
            ),
            ui.nav_panel(
                "tjStuff+ Team",
                ui.row(ui.column(2)),
                output_tabulator("table_tjstuff_team"),
            ),
        ),
    ),
)


def server(input, output, session):
    """Register the reactive data pipeline, downloads and table renderers."""

    def _min_pitches(value):
        """Coerce a numeric-input value to int; an emptied field means 1."""
        return 1 if value is None else int(value)

    def _to_table(df, min_pitches, columns):
        """Filter by pitch count, attach the team column and build the widget."""
        df = df.filter(pl.col('count') >= _min_pitches(min_pitches))
        df_plot = add_team_column(df.to_pandas(), spring_data())
        return Tabulator(df_plot, table_options=TableOptions(height=750, columns=columns))

    def _join_baseline(df_group):
        """Left-join the 2024 baseline and flag new pitches (no formatting)."""
        df_merge = df_group.join(
            df_year_old_group, on=['pitcher_id', 'pitch_type'], how='left', suffix='_old'
        )
        return mark_new_pitches(df_merge, df_year_old_group)

    # =========================================================================
    # CORE DATA LOADING (Cached)
    # =========================================================================

    # NOTE(review): none of these calcs read input.refresh, so clicking
    # "Refresh Data" re-renders the tables from the cached load rather than
    # re-fetching - confirm whether that is intended.

    @reactive.Calc
    def spring_data():
        """Load raw pitch data from parquet and fetch today's games."""
        df_spring = pl.read_parquet(
            "https://huggingface.co/datasets/TJStatsApps/mlb_data/resolve/main/data/mlb_pitch_data_2025.parquet"
        )
        # "Today" is the server's local clock shifted back 8 hours.
        # NOTE(review): presumably a UTC host mapped to US Pacific time - confirm.
        date = (datetime.datetime.now() - datetime.timedelta(hours=8)).date()
        game_list_input = (
            scraper.get_schedule(year_input=[date.year], sport_id=[1], game_type=['R'])
            .filter(pl.col('date') == date)['game_id']
        )
        data = scraper.get_data(game_list_input)
        df = scraper.get_data_df(data)
        df_spring = (
            pl.concat([df_spring, df])
            .unique(subset=['play_id'])
            .sort('game_date', descending=True)
        )
        return df_spring.filter(pl.col('start_speed') > 0)

    @reactive.Calc
    def stuff_data():
        """Apply feature engineering and stuff model - cached to avoid recomputation."""
        df_spring = spring_data().unique(subset=['play_id'])
        return stuff_apply.stuff_apply(fe.feature_engineering(df_spring))

    # =========================================================================
    # PRE-COMPUTED AGGREGATIONS (Cached)
    # =========================================================================

    @reactive.Calc
    def ts_data():
        """Season-level aggregation for download."""
        df_group = process_and_aggregate(stuff_data(), SEASON_GROUP_COLS)
        return _join_baseline(df_group).select(
            ['pitcher_id', 'pitcher_name', 'pitch_type', *DOWNLOAD_METRIC_COLS]
        )

    @reactive.Calc
    def ts_data_daily():
        """Daily-level aggregation for download."""
        df_group = process_and_aggregate(stuff_data(), DAILY_GROUP_COLS, DAILY_JOIN_COLS)
        return _join_baseline(df_group).select(
            ['pitcher_id', 'pitcher_name', 'game_date', 'pitch_type', *DOWNLOAD_METRIC_COLS]
        )

    @reactive.Calc
    def ts_data_summ():
        """tjStuff+ summary pivot table (one column per pitch type plus 'All')."""
        df_agg = stuff_data().group_by(['pitcher_id', 'pitcher_name', 'pitch_type']).agg(
            pl.len().alias('count'),
            pl.col('tj_stuff_plus').mean(),
        )

        # Calculate weighted average for "All" pitch type
        df_weighted_avg = (
            df_agg.with_columns((pl.col('tj_stuff_plus') * pl.col('count')).alias('weighted'))
            .group_by(['pitcher_id', 'pitcher_name'])
            .agg(
                pl.col('count').sum().alias('total_count'),
                pl.col('weighted').sum().alias('total_weighted'),
            )
            .with_columns(
                (pl.col('total_weighted') / pl.col('total_count')).alias('tj_stuff_plus'),
                pl.lit("All").alias('pitch_type'),
            )
            .select(['pitcher_id', 'pitcher_name', 'pitch_type',
                     pl.col('total_count').alias('count'), 'tj_stuff_plus'])
        )

        df_small = pl.concat([df_agg, df_weighted_avg])
        df_all = df_small.filter(pl.col('pitch_type') == 'All')
        count_dict = dict(zip(df_all['pitcher_id'], df_all['count']))

        df_pivot = (
            df_small.pivot(index=['pitcher_id', 'pitcher_name'], on='pitch_type',
                           values='tj_stuff_plus')
            .with_columns(
                pl.col("pitcher_id").replace_strict(count_dict, default=None).alias("count")
            )
        )

        # Ensure all pitch type columns exist. They are typed as floats so
        # numeric operations applied downstream (e.g. rounding) accept them.
        missing_cols = [col for col in PITCH_TYPES if col not in df_pivot.columns]
        if missing_cols:
            df_pivot = df_pivot.with_columns(
                [pl.lit(None, dtype=pl.Float64).alias(col) for col in missing_cols]
            )
        return df_pivot

    # =========================================================================
    # DOWNLOADS
    # =========================================================================

    @session.download(filename="data.csv")
    def download_all():
        yield ts_data().write_csv()

    @session.download(filename="data_daily.csv")
    def download_daily():
        yield ts_data_daily().write_csv()

    @session.download(filename="data_tjstuff.csv")
    def download_tjsumm():
        yield ts_data_summ().write_csv()

    # =========================================================================
    # TABLE RENDERERS
    # =========================================================================

    @output
    @render_tabulator
    @reactive.event(input.refresh)
    def table_all():
        """Season totals per pitcher and pitch type versus the 2024 baseline."""
        df_group = process_and_aggregate(stuff_data(), SEASON_GROUP_COLS)
        df_merge = merge_and_format(df_group, df_year_old_group)
        return _to_table(df_merge, input.pitches_all_min(), get_all_columns())

    @output
    @render_tabulator
    @reactive.event(input.refresh)
    def table_all_compare():
        """Compare last game to season data."""
        df_stuff = stuff_data()

        # Split into last game vs prior games
        last_game_dates = df_stuff.group_by("pitcher_id").agg(
            pl.col("game_date").max().alias("last_game_date")
        )
        df_stuff = df_stuff.join(last_game_dates, on="pitcher_id")
        df_last = df_stuff.filter(pl.col("game_date") == pl.col("last_game_date"))
        df_prior = df_stuff.filter(pl.col("game_date") != pl.col("last_game_date"))

        # Aggregate both datasets using shared function
        df_last_group = process_and_aggregate(df_last, SEASON_GROUP_COLS)
        df_prior_group = process_and_aggregate(df_prior, SEASON_GROUP_COLS)

        # The baseline columns carry the "_prior" suffix after the join, so the
        # pair's second element must be derived from the *old* column name.
        cols_prior = [(new, old.replace("_old", "_prior")) for new, old in COLS_TO_SUBTRACT]
        cols_percent_prior = [
            (new, old.replace("_old", "_prior")) for new, old in COLS_TO_SUBTRACT_PERCENT
        ]

        df_merge = merge_and_format(
            df_last_group, df_prior_group, cols_prior, cols_percent_prior, suffix="_prior"
        )
        return _to_table(df_merge, input.pitches_all_compare_min(), get_all_columns())

    @output
    @render_tabulator
    @reactive.event(input.refresh)
    def table_daily():
        """Daily breakdown by game."""
        df_group = process_and_aggregate(stuff_data(), DAILY_GROUP_COLS, DAILY_JOIN_COLS)
        df_merge = merge_and_format(df_group, df_year_old_group)
        return _to_table(df_merge, input.pitches_daily_min(), get_daily_columns())

    @output
    @render_tabulator
    @reactive.event(input.refresh)
    def table_tjstuff():
        """tjStuff+ table with 2024 comparison."""
        df_group = process_and_aggregate(stuff_data(), SEASON_GROUP_COLS)
        df_merge = _join_baseline(df_group)

        # Simple formatting without diff brackets (uses native Polars).
        # Unlike the other tables, a missing baseline yields a null diff here.
        df_merge = df_merge.with_columns([
            pl.when(pl.col(old).is_null())
            .then(pl.lit(None))
            .otherwise(pl.col(new) - pl.col(old))
            .alias(new + "_diff")
            for new, old in COLS_TO_SUBTRACT
        ])
        df_merge = df_merge.with_columns([
            pl.col(new).round(1).cast(pl.Utf8).alias(new + "_formatted")
            for new, _ in COLS_TO_SUBTRACT
        ])

        # Format tjStuff+ old and diff using native Polars (no map_elements)
        diff_text = pl.col("tj_stuff_plus_diff").round(1).cast(pl.Utf8)
        df_merge = df_merge.with_columns([
            pl.col("tj_stuff_plus_old").round(1).cast(pl.Utf8).alias("tj_stuff_plus_old"),
            pl.when(pl.col("tj_stuff_plus_diff") >= 0)
            .then(pl.lit("+") + diff_text)
            .otherwise(diff_text)
            .alias("tj_stuff_plus_diff"),
        ])

        # Format percent columns using native Polars
        df_merge = df_merge.with_columns([
            ((pl.col(col) * 100).round(1).cast(pl.Utf8) + "%").alias(col + "_formatted")
            for col in ['pitch_percent', 'rhh_percent', 'lhh_percent']
        ]).sort(['pitcher_id', 'count'], descending=True)

        return _to_table(df_merge, input.pitches_tjstuff_min(), get_tjstuff_columns())

    @output
    @render_tabulator
    @reactive.event(input.refresh)
    def table_stuff_all():
        """tjStuff+ summary pivot table."""
        df_pivot = (
            ts_data_summ()
            .with_columns([pl.col(col).round(0).alias(col) for col in PITCH_TYPES])
            .sort(['pitcher_id', 'count'], descending=True)
        )
        return _to_table(df_pivot, input.pitches_tjsumm_min(), get_summary_columns())

    @output
    @render_tabulator
    @reactive.event(input.refresh)
    def table_tjstuff_team():
        """tjStuff+ by team."""
        df_team = stuff_data().group_by(['pitcher_team']).agg([
            pl.len().alias('count'),
            pl.col('tj_stuff_plus').mean().round(0).alias('tj_stuff_plus'),
        ]).sort(['tj_stuff_plus'], descending=True)
        return Tabulator(
            df_team.to_pandas(),
            table_options=TableOptions(height=750, columns=get_team_columns()),
        )


app = App(app_ui, server)