"""Streamlit app: Catch Probability Lookup Tool.

Retrieves catch-probability data for a selected fielder from Baseball
Savant, displays it in an interactive AgGrid table, and lets the user
select a row to view the corresponding play video.

By: Thomas Nestico (@TJStats) — Data: MLB / Baseball Savant.
"""

# --- stdlib ---
import json
import os
import urllib.request
from urllib.error import HTTPError

# --- third-party ---
import pandas as pd
import requests
import streamlit as st
from bs4 import BeautifulSoup
from datasets import load_dataset
from st_aggrid import AgGrid, GridOptionsBuilder, GridUpdateMode

# Statcast numeric position codes -> standard position abbreviations.
pos_dict = {
    1: 'P', 2: 'C', 3: '1B', 4: '2B', 5: '3B',
    6: 'SS', 7: 'LF', 8: 'CF', 9: 'RF', 10: 'DH',
}

# Set Streamlit page configuration
st.set_page_config(layout="wide")

# Inject custom CSS to set the width of the container to 1250px.
# NOTE(review): the original inline CSS was lost in extraction (the string
# was empty); reconstructed from the stated 1250px intent — confirm styling.
st.markdown(
    """
    <style>
    .block-container {
        max-width: 1250px;
    }
    </style>
    """,
    unsafe_allow_html=True,
)

# Wrap the main content in a container with the specified width.
# NOTE(review): opening-div HTML was lost in extraction; reconstructed.
st.markdown('<div class="main-container">', unsafe_allow_html=True)

# Example text with links and bullet points
markdown_text = """
## Catch Probability Lookup Tool

##### By: Thomas Nestico ([@TJStats](https://x.com/TJStats))
##### Data: [MLB](https://baseballsavant.mlb.com/)

#### About
This Streamlit app retrieves catch probability data for a selected fielder from [Baseball Savant](https://baseballsavant.mlb.com/leaderboard/catch_probability).
The app displays the fielder's data in a table and allows the user to select a row to view the corresponding video.

Catch probability data is only available for outfielders.

#### What is Catch Probability?
*From MLB:*

**Catch Probability** expresses the likelihood for a ball to be caught by an outfielder based on opportunity time, distance needed, and direction. “Opportunity time” starts when the ball is released by the pitcher, and “distance needed” is the shortest distance needed to make the catch.

[Learn more about how direction is accounted for here](https://www.mlb.com/news/catch-probability-updated-to-include-direction-c232532408).

[Read more about the details of how Catch Probability works here](https://www.mlb.com/news/statcast-introduces-catch-probability-for-2017-c217802340).
"""

markdown_text_end = '''
*Columns:*
- **Batter Name**: Name of the batter
- **Pitcher Name**: Name of the pitcher
- **Fielder Name**: Name of the fielder
- **Position**: Position of the fielder
- **Event**: Type of play
- **Out**: Was the ball caught?
- **Wall**: [Did the fielder catch the ball at the wall?](https://www.mlb.com/news/catch-probability-updated-to-account-for-walls-c269814542)
- **Back**: [Did the fielder catch the ball while moving back?](https://www.mlb.com/news/catch-probability-updated-to-include-direction-c232532408)
- **Stars**: [Number of stars assigned to the play](https://baseballsavant.mlb.com/leaderboard/catch_probability)
- **Distance**: Distance required to make the catch in feet
- **Hang Time**: Hang time of the ball in seconds
- **Catch Rate**: Probability of the catch being made
'''

# Display the markdown text in Streamlit
st.markdown(markdown_text)

### Import Datasets
season = 2025
level = 'mlb'

# Pull the season's pitch-level data from the Hugging Face dataset repo.
ds = load_dataset(
    "TJStatsApps/mlb_data",
    data_files=f"data/{level}_pitch_data_{season}.parquet",
)
dataset = ds["train"].to_pandas()

# Keep one row per play. `.copy()` guards against a SettingWithCopy
# warning on the column assignments below.
df = dataset.drop_duplicates(subset=['play_id'], keep='last').copy()
df['batter_name_team'] = df['batter_name'] + ' - ' + df['batter_team']

# Map of player id -> "Name - Team" used to populate the selectbox.
# NOTE(review): built from batter_* columns even though the UI selects a
# fielder — presumably every fielder also appears as a batter; confirm.
fielders = (
    df.drop_duplicates(['batter_id'])
      .sort_values(['batter_name'])
      .set_index('batter_id')['batter_name_team']
      .to_dict()
)
fielders_reversed = {v: k for k, v in fielders.items()}

# Create a selectbox for selecting a key from the dictionary
st.write("#### Select Fielder")
selected_fielder = st.selectbox('', list(fielders_reversed.keys()))

# Retrieve the corresponding player ID for the Savant query.
fielder_select = fielders_reversed[selected_fielder]

# Baseball Savant per-player fielding-range endpoint (returns JSON).
url = f"https://baseballsavant.mlb.com/player-services/range?playerId={fielder_select}&season={season}&playerType=fielder"

data = requests.get(url).json()
df_catch = pd.DataFrame(data)

# Bail out early when Savant has no catch-probability rows (e.g. infielders).
if df_catch.empty:
    st.write("No data available for the selected fielder.")
    st.stop()

df_catch['hang_time'] = df_catch['hang_time'].astype(float).round(1)
df_catch['distance'] = df_catch['distance'].astype(float).round(1)

# Right-merge keeps every Savant opportunity row, enriched with the pitch
# data where play_id matches.
df_merge = (
    df.merge(df_catch, on='play_id', how='right', suffixes=('', '_fielder'))
      .reset_index(drop=True)
)

df_merge['pos'] = df_merge['pos'].astype(int)
df_merge['Position'] = df_merge['pos'].map(pos_dict)

# Drop rows where the selected player is the batter on the play.
df_merge = df_merge[df_merge['batter_id'] != df_merge['player_id']]
df_merge.sort_values(by='game_date', inplace=True)

column_names = ['game_date', 'batter_name', 'pitcher_name',
                'name_display_first_last', 'Position', 'event', 'out',
                'wall', 'back', 'stars', 'distance', 'hang_time',
                'catch_rate']
column_names_display = ['Game Date', 'Batter Name', 'Pitcher Name',
                        'Fielder Name', 'Position', 'Event', 'Out', 'Wall',
                        'Back', 'Stars', 'Distance', 'Hang Time',
                        'Catch Rate']

# Use a container to control the width of the AgGrid display
with st.container():
    st.write("#### Fielder Data")

    # Configure the AgGrid options: friendly header names plus
    # single-row checkbox selection to drive the video link below.
    gb = GridOptionsBuilder.from_dataframe(df_merge[column_names])
    for col, display_name in zip(column_names, column_names_display):
        gb.configure_column(col, headerName=display_name)
    gb.configure_selection('single', use_checkbox=True)
    grid_options = gb.build()

    # Display the dataframe using AgGrid
    grid_response = AgGrid(
        df_merge[column_names],
        gridOptions=grid_options,
        update_mode=GridUpdateMode.SELECTION_CHANGED,
        height=300,
        allow_unsafe_jscode=True,
        width="100%",
    )

# Show the video for the selected row, if any. The selection lookup
# raises when no row is checked yet, so the whole step is wrapped in
# try/except rather than pre-validated.
# NOTE(review): the original except clause and embed HTML were garbled in
# extraction; reconstructed — confirm against the deployed app.
try:
    selected_row_index = int(grid_response['selected_rows'].index.values[0])
    play_link = f'https://baseballsavant.mlb.com/sporty-videos?playId={df_merge["play_id"].values[selected_row_index]}'
    st.markdown(
        f'<iframe src="{play_link}" width="100%" height="500"></iframe>',
        unsafe_allow_html=True,
    )
except Exception:
    st.write("Select a row to view the corresponding video.")

# Close the fixed-width container opened at the top of the page.
st.markdown('</div>', unsafe_allow_html=True)