File size: 7,449 Bytes
aa49833
 
 
 
 
 
 
 
c247301
aa49833
 
07b3375
 
 
 
 
 
 
 
 
 
 
 
aa49833
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c644018
 
 
 
9e77e0f
 
 
1606618
ee029d5
c644018
83173ab
c644018
 
 
 
 
ee029d5
c644018
 
 
ee029d5
 
c644018
c8b3408
 
 
07b3375
 
 
 
 
 
 
ee029d5
 
677f8e6
07b3375
 
 
c8b3408
c644018
 
 
 
 
aa49833
67c2815
 
 
a41a992
67c2815
6806b25
b56a26c
 
06789cc
3f6340f
515500e
aa49833
 
 
 
 
 
 
e61a842
f61359b
aa49833
 
 
 
 
737255a
aa49833
939ade4
 
aa49833
939ade4
 
 
 
 
aa49833
939ade4
 
 
 
aa49833
939ade4
 
aa49833
939ade4
 
5de68ce
 
 
 
 
aa49833
 
 
 
 
 
939ade4
07b3375
ee029d5
07b3375
 
1606618
9e77e0f
 
 
5de68ce
07b3375
1606618
 
 
07b3375
 
 
aa49833
8d95317
1606618
 
e61a842
1606618
 
 
 
 
07b3375
 
1606618
 
aa49833
1606618
 
 
 
 
 
 
2edc113
1606618
aa49833
1606618
aa49833
 
1606618
 
 
dc210c6
 
 
 
1606618
a0660fc
dc210c6
a0660fc
 
e86de71
1606618
2e8c439
aa49833
e86de71
aa49833
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
import urllib.request
from urllib.error import HTTPError
import requests
from bs4 import BeautifulSoup
import os
import json
import streamlit as st
import pandas as pd
# import polars as pl
from st_aggrid import AgGrid, GridOptionsBuilder, GridUpdateMode


# Map Statcast numeric position codes (1-10) to standard abbreviations.
pos_dict = dict(
    zip(
        range(1, 11),
        ['P', 'C', '1B', '2B', '3B', 'SS', 'LF', 'CF', 'RF', 'DH'],
    )
)

# Configure Streamlit to use the full browser width.
st.set_page_config(layout="wide")

# Custom CSS: cap the main content container at 1250px and center it.
_container_css = """
    <style>
    .main-container {
        max-width: 1250px;
        margin: 0 auto;
    }
    </style>
    """
st.markdown(_container_css, unsafe_allow_html=True)

# Open the width-limited wrapper div; it is closed at the end of the script.
st.markdown('<div class="main-container">', unsafe_allow_html=True)

# Page copy: intro/about text shown at the top, and a column glossary
# (markdown_text_end) rendered below the grid at the end of the script.
markdown_text = """
## Catch Probability Lookup Tool

##### By: Thomas Nestico ([@TJStats](https://x.com/TJStats))
##### Data: [MLB](https://baseballsavant.mlb.com/)

#### About
This Streamlit app retrieves catch probability data for a selected fielder from [Baseball Savant](https://baseballsavant.mlb.com/leaderboard/catch_probability).
The app displays the fielder's data in a table and allows the user to select a 
row to view the corresponding video.

Catch probability data is only available for outfielders.

#### What is Catch Probability?
*From MLB:*

**Catch Probability** expresses the likelihood for a ball to be caught by an outfielder based on opportunity time, 
distance needed, and direction. “Opportunity time” starts when the ball is released by the pitcher, 
and “distance needed” is the shortest distance needed to make the catch.
[Learn more about how direction is accounted for here](https://www.mlb.com/news/catch-probability-updated-to-include-direction-c232532408). 
[Read more about the details of how Catch Probability works here](https://www.mlb.com/news/statcast-introduces-catch-probability-for-2017-c217802340).

"""

# Glossary for the grid columns; displayed after the table (see end of file).
markdown_text_end = '''
*Columns:*
- **Batter Name**: Name of the batter
- **Pitcher Name**: Name of the pitcher
- **Fielder Name**: Name of the fielder
- **Position**: Position of the fielder
- **Event**: Type of play
- **Out**: Was the ball caught?
- **Wall**: [Did the fielder catch the ball at the wall?](https://www.mlb.com/news/catch-probability-updated-to-account-for-walls-c269814542)
- **Back**: [Did the fielder catch the ball while moving back?](https://www.mlb.com/news/catch-probability-updated-to-include-direction-c232532408)
- **Stars**: [Number of stars assigned to the play](https://baseballsavant.mlb.com/leaderboard/catch_probability)
- **Distance**: Distance required to make the catch in feet
- **Hang Time**: Hang time of the ball in seconds
- **Catch Rate**: Probability of the catch being made
'''

# Render the intro/about section.
st.markdown(markdown_text)


# ---- Data loading -------------------------------------------------------
import datasets
from datasets import load_dataset

# Pull the current season's MLB pitch-level data from the Hugging Face hub.
season = 2025
level = 'mlb'
ds = load_dataset("TJStatsApps/mlb_data", data_files=f"data/{level}_pitch_data_{season}.parquet")
dataset = ds["train"].to_pandas()

# Keep only the most recent record for each play.
df = dataset.drop_duplicates(subset=['play_id'], keep='last')
df['batter_name_team'] = df['batter_name'] + ' - ' + df['batter_team']

# Build id -> "Name - Team" lookups for the picker, and the reverse mapping.
# NOTE(review): these are derived from batter_* columns even though they feed
# the "fielder" selectbox — presumably every fielder also appears as a batter;
# confirm against the dataset.
unique_players = df.drop_duplicates(['batter_id']).sort_values(['batter_name'])
fielders = unique_players.set_index('batter_id')['batter_name_team'].to_dict()
fielders_reversed = dict(zip(fielders.values(), fielders.keys()))

# Fielder picker: show "Name - Team" labels, then resolve the chosen label
# back to its MLB player id.
st.write("#### Select Fielder")
selected_fielder = st.selectbox('', list(fielders_reversed.keys()))
fielder_select = fielders_reversed[selected_fielder]

# Baseball Savant endpoint with per-play range/catch data for this fielder.
url = (
    "https://baseballsavant.mlb.com/player-services/range"
    f"?playerId={fielder_select}&season={season}&playerType=fielder"
)

# Fetch the catch-probability rows for the selected fielder as JSON.
# (An earlier version scraped the 'rangeLine' data out of the page HTML;
# this endpoint returns the same records directly.)
#
# Fix: the previous bare `requests.get(url).json()` had no timeout (could
# hang the app indefinitely) and no HTTP status check (a 4xx/5xx would fail
# with a confusing JSON decode error instead of a clear HTTP error).
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.json()
df_catch = pd.DataFrame(data)

# Bail out early when Savant returned nothing for this fielder.
if df_catch.empty:
    st.write("No data available for the selected fielder.")
    st.stop()

# Round the continuous metrics to one decimal place for display.
for metric in ('hang_time', 'distance'):
    df_catch[metric] = df_catch[metric].astype(float).round(1)

# Attach pitch-level context to every catch-probability row; right join keeps
# exactly the plays Savant reported for this fielder.
df_merge = (
    df.merge(df_catch, on='play_id', how='right', suffixes=('', '_fielder'))
      .reset_index(drop=True)
)

# Translate the numeric position code to its abbreviation.
df_merge['pos'] = df_merge['pos'].astype(int)
df_merge['Position'] = df_merge['pos'].map(pos_dict)

# Drop rows where the selected fielder is also the batter of record.
df_merge = df_merge[df_merge['batter_id'] != df_merge['player_id']]

# Chronological order for the grid.
df_merge = df_merge.sort_values(by='game_date')

# Raw column names and the matching headers shown in the grid.
column_names = ['game_date','batter_name', 'pitcher_name', 'name_display_first_last', 'Position','event', 'out', 'wall', 'back', 'stars', 'distance', 'hang_time', 'catch_rate']
column_names_display = ['Game Date','Batter Name', 'Pitcher Name', 'Fielder Name', 'Position','Event', 'Out', 'Wall', 'Back', 'Stars', 'Distance', 'Hang Time', 'Catch Rate']





# Render the merged data in an interactive AgGrid table inside a container
# so the width styling applies.
with st.container():
    st.write("#### Fielder Data")

    display_df = df_merge[column_names]

    # Grid options: friendly header names plus single-row checkbox selection.
    builder = GridOptionsBuilder.from_dataframe(display_df)
    for col, header in dict(zip(column_names, column_names_display)).items():
        builder.configure_column(col, headerName=header)
    builder.configure_selection('single', use_checkbox=True)

    # Re-render whenever the user's selection changes.
    grid_response = AgGrid(
        display_df,
        gridOptions=builder.build(),
        update_mode=GridUpdateMode.SELECTION_CHANGED,
        height=300,
        allow_unsafe_jscode=True,
        width="100%",
    )

# If a row is selected, link to the Baseball Savant video for that play.
#
# Fix: `grid_response['selected_rows']` is None before any selection
# (AttributeError on `.index`), but depending on the st-aggrid version it can
# also be an EMPTY DataFrame — then `.index.values[0]` raises IndexError,
# which the old `except AttributeError` did not catch and crashed the app.
# KeyError/TypeError are included for versions that return a list/missing key.
try:
    selected_row_index = int(grid_response['selected_rows'].index.values[0])
    play_link = f'https://baseballsavant.mlb.com/sporty-videos?playId={df_merge["play_id"].values[selected_row_index]}'
    st.markdown(f'#### [Link to Video]({play_link})')
except (AttributeError, IndexError, KeyError, TypeError):
    # No row selected yet — prompt the user instead of crashing.
    st.write("#### Select Row to Get Video Link")

# Show the column glossary, then close the width-limited wrapper div that was
# opened near the top of the script.
st.markdown(markdown_text_end)
st.markdown('</div>', unsafe_allow_html=True)