# Hugging Face Spaces app — Catch Probability Lookup Tool.
# (Spaces UI status lines removed; they are not part of the program.)
| import urllib.request | |
| from urllib.error import HTTPError | |
| import requests | |
| from bs4 import BeautifulSoup | |
| import os | |
| import json | |
| import streamlit as st | |
| import pandas as pd | |
| # import polars as pl | |
| from st_aggrid import AgGrid, GridOptionsBuilder, GridUpdateMode | |
# Statcast numeric position codes mapped to standard scorekeeping
# abbreviations (1 = P, 2 = C, ..., 9 = RF, 10 = DH).
_POSITION_ABBREVS = ['P', 'C', '1B', '2B', '3B', 'SS', 'LF', 'CF', 'RF', 'DH']
pos_dict = {code: abbrev for code, abbrev in enumerate(_POSITION_ABBREVS, start=1)}
# Configure the Streamlit page to use the full browser width.
st.set_page_config(layout="wide")
# Inject custom CSS defining a centered, 1250px-wide container class.
st.markdown(
    """
    <style>
    .main-container {
        max-width: 1250px;
        margin: 0 auto;
    }
    </style>
    """,
    unsafe_allow_html=True
)
# Open the container div; the matching closing </div> is emitted at the end
# of the script. NOTE(review): Streamlit renders each st.markdown call as its
# own element, so this div may not actually wrap the widgets between the open
# and close tags — confirm the intended layout effect in the browser.
st.markdown('<div class="main-container">', unsafe_allow_html=True)
# Header/about copy rendered at the top of the app (title, author credit,
# data source, and a short explanation of MLB's Catch Probability metric).
markdown_text = """
## Catch Probability Lookup Tool
##### By: Thomas Nestico ([@TJStats](https://x.com/TJStats))
##### Data: [MLB](https://baseballsavant.mlb.com/)
#### About
This Streamlit app retrieves catch probability data for a selected fielder from [Baseball Savant](https://baseballsavant.mlb.com/leaderboard/catch_probability).
The app displays the fielder's data in a table and allows the user to select a
row to view the corresponding video.
Catch probability data is only available for outfielders.
#### What is Catch Probability?
*From MLB:*
**Catch Probability** expresses the likelihood for a ball to be caught by an outfielder based on opportunity time,
distance needed, and direction. “Opportunity time” starts when the ball is released by the pitcher,
and “distance needed” is the shortest distance needed to make the catch.
[Learn more about how direction is accounted for here](https://www.mlb.com/news/catch-probability-updated-to-include-direction-c232532408).
[Read more about the details of how Catch Probability works here](https://www.mlb.com/news/statcast-introduces-catch-probability-for-2017-c217802340).
"""
# Column glossary shown below the data table (rendered near the end of the
# script, after the grid and video link).
markdown_text_end = '''
*Columns:*
- **Batter Name**: Name of the batter
- **Pitcher Name**: Name of the pitcher
- **Fielder Name**: Name of the fielder
- **Position**: Position of the fielder
- **Event**: Type of play
- **Out**: Was the ball caught?
- **Wall**: [Did the fielder catch the ball at the wall?](https://www.mlb.com/news/catch-probability-updated-to-account-for-walls-c269814542)
- **Back**: [Did the fielder catch the ball while moving back?](https://www.mlb.com/news/catch-probability-updated-to-include-direction-c232532408)
- **Stars**: [Number of stars assigned to the play](https://baseballsavant.mlb.com/leaderboard/catch_probability)
- **Distance**: Distance required to make the catch in feet
- **Hang Time**: Hang time of the ball in seconds
- **Catch Rate**: Probability of the catch being made
'''
# Render the header/about text defined above.
st.markdown(markdown_text)
# --- Load season pitch data and build the player picker ---------------------
# Import kept local to this section of the script, mirroring the original
# layout (the unused bare `import datasets` has been dropped).
from datasets import load_dataset

season = 2025
level = 'mlb'

# Pull the season's pitch-level parquet file from the Hugging Face dataset repo.
ds = load_dataset("TJStatsApps/mlb_data", data_files=f"data/{level}_pitch_data_{season}.parquet")
dataset = ds["train"].to_pandas()

# One row per play: keep the last record seen for each play_id.
# .copy() makes the frame independent so the column assignment below cannot
# trigger pandas chained-assignment warnings.
df = dataset.drop_duplicates(subset=['play_id'], keep='last').copy()
df['batter_name_team'] = df['batter_name'] + ' - ' + df['batter_team']

# Display-label -> MLB player id lookup. NOTE(review): built from the batter_*
# columns — every player appears as a batter in the pitch data, and the same
# MLB id is then used as the fielder playerId in the Savant request below.
fielders = (
    df.drop_duplicates(['batter_id'])
      .sort_values(['batter_name'])
      .set_index('batter_id')['batter_name_team']
      .to_dict()
)
fielders_reversed = {v: k for k, v in fielders.items()}

# Player picker widget.
st.write("#### Select Fielder")
selected_fielder = st.selectbox('', list(fielders_reversed.keys()))

# Resolve the chosen label back to its MLB player id.
fielder_select = fielders_reversed[selected_fielder]

# Baseball Savant catch-probability ("range") endpoint for this player/season.
url = f"https://baseballsavant.mlb.com/player-services/range?playerId={fielder_select}&season={season}&playerType=fielder"
# Fetch catch-probability rows for the selected fielder.
# Hardened versus the original: a request timeout and an explicit HTTP status
# check, so a Savant outage surfaces as a readable message instead of a hang
# or a JSON decode traceback. (Dead commented-out scraping code removed.)
try:
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    data = response.json()
except requests.RequestException as e:
    st.error(f"Failed to fetch data from Baseball Savant: {e}")
    st.stop()

df_catch = pd.DataFrame(data)

# No rows (e.g. infielders/pitchers have no catch-probability data): tell the
# user and halt the script here.
if df_catch.empty:
    st.write("No data available for the selected fielder.")
    st.stop()
# Round numeric fields for display.
df_catch['hang_time'] = df_catch['hang_time'].astype(float).round(1)
df_catch['distance'] = df_catch['distance'].astype(float).round(1)

# Attach pitch-level context to each catch-probability row; how='right' keeps
# every Savant row even if a play is missing from the pitch dataset.
df_merge = df.merge(df_catch, on='play_id', how='right', suffixes=('', '_fielder')).reset_index(drop=True)

# Numeric position code -> abbreviation (e.g. 8 -> 'CF').
df_merge['pos'] = df_merge['pos'].astype(int)
df_merge['Position'] = df_merge['pos'].map(pos_dict)

# Drop plays where the selected player was the batter rather than the fielder.
df_merge = df_merge[df_merge['batter_id'] != df_merge['player_id']]

# BUG FIX: the original `sort_values(..., inplace=True)` kept the old index
# labels after the filter above, but the video-link code later indexes
# df_merge positionally with the grid's row index. ignore_index=True renumbers
# the index 0..n-1 so labels and positions agree.
df_merge = df_merge.sort_values(by='game_date', ignore_index=True)

# Raw dataframe column names and the header names to show in the grid.
column_names = ['game_date','batter_name', 'pitcher_name', 'name_display_first_last', 'Position','event', 'out', 'wall', 'back', 'stars', 'distance', 'hang_time', 'catch_rate']
column_names_display = ['Game Date','Batter Name', 'Pitcher Name', 'Fielder Name', 'Position','Event', 'Out', 'Wall', 'Back', 'Stars', 'Distance', 'Hang Time', 'Catch Rate']
# Render the merged data in an interactive AgGrid with single-row selection;
# the container scopes the grid's width styling.
with st.container():
    st.write("#### Fielder Data")

    display_df = df_merge[column_names]

    # Build grid options: friendly header names plus checkbox row selection.
    builder = GridOptionsBuilder.from_dataframe(display_df)
    for raw_name, pretty_name in zip(column_names, column_names_display):
        builder.configure_column(raw_name, headerName=pretty_name)
    builder.configure_selection('single', use_checkbox=True)

    # Draw the grid; re-runs the script whenever the selection changes.
    grid_response = AgGrid(
        display_df,
        gridOptions=builder.build(),
        update_mode=GridUpdateMode.SELECTION_CHANGED,
        height=300,
        allow_unsafe_jscode=True,
        width="100%",
    )
# Turn the grid selection into a Baseball Savant video link.
# NOTE(review): the shape of grid_response['selected_rows'] is version-
# dependent in streamlit-aggrid (DataFrame in recent releases, list/None in
# older ones); this code assumes a DataFrame, and the AttributeError branch
# catches the no-selection case (e.g. None has no .index) — confirm against
# the pinned st_aggrid version.
try:
    # First selected row's index label from the grid's returned DataFrame.
    selected_row_index = int(grid_response['selected_rows'].index.values[0])
    # NOTE(review): .values[...] indexes positionally while the grid returns
    # an index label — these only agree when df_merge's index is 0..n-1;
    # verify df_merge's index is reset before this point.
    play_link = f'https://baseballsavant.mlb.com/sporty-videos?playId={df_merge["play_id"].values[selected_row_index]}'
    #a = requests.get(f'https://baseballsavant.mlb.com/sporty-videos?playId={df_merge["play_id"].values[selected_row_index]}')
    #soup = BeautifulSoup(a.content, 'lxml')
    #video_url = str(soup).split('<source src="')[1].split('" ')[0]
    # Share the video through Streamlit
    #st.video(video_url)
    st.markdown( f'#### [Link to Video]({play_link})')
    #st.write("Select Row to Display Video")
except AttributeError:
    # Nothing selected yet: prompt the user instead of linking.
    st.write("#### Select Row to Get Video Link")
# Footer: column glossary, then close the layout div opened at the top.
st.markdown(markdown_text_end)
st.markdown('</div>', unsafe_allow_html=True)