# eller_nxp / app.py
# Author: Anirudha Soni
# Commit 401443c — "Enable reference curve function"
import pandas as pd
import numpy as np
from prepare import Process
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import plotly.graph_objects as go
from datetime import datetime
from dateutil.relativedelta import relativedelta
from scipy.stats import norm
# --- Data ingestion & reference price-curve assembly -------------------------
# Run the project's preprocessing pipeline over the raw transaction/quote data.
process_data = Process(
    transaction_file='transaction_data.parquet',
    quote_data_dir='quote_data/',
    end_date='2025-07-01',
    mini=5000,
    maxi=500000,
    ct=100,
)
df_origin = process_data.run()

# Auxiliary datasets: lost-quote predictions and the two reference price curves.
df_lost = pd.read_parquet("lost_preds.parquet")
pricing_df1 = pd.read_parquet("price_curve.parquet")
pricing_df2 = pd.read_parquet("price_curve2.parquet")

# Normalize the first curve's headers, then map "+<qty>u"-style column names to
# plain unit-count strings so both curve files share one schema.
pricing_df1.columns = pricing_df1.columns.str.strip()
rename_map = {
    "+5Ku": "5000",
    "+10Ku": "10000",
    "+25Ku": "25000",
    "+50Ku": "50000",
    "+100Ku": "100000",
    "+250Ku": "250000",
    "+500Ku": "500000",
    "+1Ku": "1000",
    "+1u": "1",
    "+1Mu": "1000000",
}
cols_to_keep = [
    "product-anonymized", "DBC",
    "5000", "10000", "25000", "50000", "100000", "250000", "500000",
]
# The second file may carry numeric column labels; force them to strings so
# the shared column selection works on both frames.
pricing_df2.columns = pricing_df2.columns.map(str)
pricing_df_renamed = pricing_df1.rename(columns=rename_map)
df1 = pricing_df_renamed[cols_to_keep]
df2 = pricing_df2[cols_to_keep]
# One combined reference-curve table keyed by "product-anonymized".
pricing_df = pd.concat([df1, df2], ignore_index=True)
# --- Build the modelling frame from Tier-4 customer transactions -------------
logit_cols = ['QUOTE_ITEM_CREATE_DATE', 'QUOTE_QTY', 'FF_REGION', 'FINAL_QUOTED_RESALE_USD', 'DISTI_MARGIN', 'REGISTRATION_EFFORT', 'product', 'WON/LOSS', 'log_quote_qty', 'age']
tier4_mask = df_origin['END_CUSTOMER_CATEGORY'].isin(
    ['Tier 4 - TMMA', 'Tier 4 - ROM', 'Tier 4 - Longtail'])
df = df_origin.loc[tier4_mask, logit_cols].copy()
df = df.rename(columns={
    'product': 'product_id',
    'FINAL_QUOTED_RESALE_USD': 'final_trade_price',
    'log_quote_qty': 'log_QUOTE_QTY',
    'WON/LOSS': 'is_win',
})

# Keep only products with enough quote history (> 100 rows) to model.
product_counts = df['product_id'].value_counts()
keys = product_counts[product_counts > 100].index.tolist()
df = df[df['product_id'].isin(keys)]

df['QUOTE_ITEM_CREATE_DATE'] = pd.to_datetime(df['QUOTE_ITEM_CREATE_DATE'])
df['logprice'] = np.log1p(df['final_trade_price'])
# Interaction term: log price x log quantity.
df['logproduct'] = df['logprice'] * df['log_QUOTE_QTY']

# --- Lag features, computed per product in chronological order ---------------
df = df.sort_values(['product_id', 'QUOTE_ITEM_CREATE_DATE'])
for lag in (1, 2, 3):
    df[f'logprice_lag{lag}'] = df.groupby('product_id')['logprice'].shift(lag)
for lag in (1, 2, 3):
    df[f'logproduct_lag{lag}'] = df.groupby('product_id')['logproduct'].shift(lag)

# 7-observation rolling means of the one-step-lagged series.
for base in ('logprice', 'logproduct'):
    df[f'{base}_mean7'] = df.groupby('product_id')[f'{base}_lag1'].transform(
        lambda s: s.rolling(window=7, min_periods=1).mean()
    )

# Rows without a full lag history (or price) cannot be used for training.
df = df.dropna(subset=['logprice_lag1', 'logprice_lag2', 'logprice_lag3', 'final_trade_price'])
df = df.sort_values('QUOTE_ITEM_CREATE_DATE').reset_index(drop=True)

# Accepted-quote history used by the dashboard's lookback table.
required_columns = [
    'product_id',
    'QUOTE_QTY',
    'QUOTE_ITEM_CREATE_DATE',
    'final_trade_price',
    'predicted_price',
]
df_accepted = pd.read_parquet("df_accepted_for_dashboard.parquet", columns=required_columns)
def get_historical_features(product_id, historical_df):
    """Return lag/rolling price features for ``product_id`` from ``historical_df``.

    The most recent observation is treated as the newest lag for a brand-new
    quote: its logprice becomes lag1, its lag1 becomes lag2, and so on.

    Fix vs. previous version: the empty-history branch now returns the same
    key set as the non-empty branch (adds 'REGISTRATION_EFFORT' and
    'DISTI_MARGIN' defaults), so downstream feature dicts are always complete;
    mode() is indexed positionally with .iloc[0] instead of label-based [0].
    """
    product_history = historical_df[historical_df['product_id'] == product_id].sort_values('QUOTE_ITEM_CREATE_DATE')
    if product_history.empty:
        # Cold-start fallback: zeroed lags plus the same default categorical /
        # margin values the non-empty branch would otherwise supply.
        # NOTE(review): 0 margin is a neutral placeholder — confirm acceptable.
        return {
            'logprice_lag1': 0, 'logprice_lag2': 0, 'logprice_lag3': 0,
            'logproduct_lag1': 0, 'logproduct_lag2': 0, 'logproduct_lag3': 0,
            'logprice_mean7': 0, 'logproduct_mean7': 0,
            'REGISTRATION_EFFORT': 'FRP', 'DISTI_MARGIN': 0,
        }
    last_record = product_history.iloc[-1]
    # Shift the lag structure forward by one step relative to the last record.
    features = {
        'logprice_lag1': last_record['logprice'],
        'logprice_lag2': last_record['logprice_lag1'],
        'logprice_lag3': last_record['logprice_lag2'],
        'logproduct_lag1': last_record['logproduct'],
        'logproduct_lag2': last_record['logproduct_lag1'],
        'logproduct_lag3': last_record['logproduct_lag2'],
    }
    # Rolling means over the last (up to) 7 lagged observations (NaN-aware).
    recent_history = product_history.set_index('QUOTE_ITEM_CREATE_DATE').tail(7)
    features['logprice_mean7'] = recent_history['logprice_lag1'].mean()
    features['logproduct_mean7'] = recent_history['logproduct_lag1'].mean()
    # Most common registration effort for this product; 'FRP' if undecidable.
    reg_effort_mode = product_history['REGISTRATION_EFFORT'].mode()
    features['REGISTRATION_EFFORT'] = reg_effort_mode.iloc[0] if not reg_effort_mode.empty else 'FRP'
    features['DISTI_MARGIN'] = product_history['DISTI_MARGIN'].median()
    return features
def calculate_imr(quote_data, probit_model, probit_feature_columns):
    """Compute the inverse Mills ratio phi(z)/Phi(z) for one quote.

    Builds a single-row design matrix: one-hot columns named "<col>_<value>"
    are set to 1 when present, raw numeric columns are copied through, and a
    'const' intercept column is set to 1.  The probit model supplies the
    linear-index z; returns 0 when the CDF is non-positive.
    """
    row = pd.DataFrame(columns=probit_feature_columns, index=[0], dtype=float).fillna(0)
    # Activate any dummy column matching "<feature>_<value>".
    for feature, value in quote_data.items():
        dummy_name = f"{feature}_{value}"
        if dummy_name in row.columns:
            row[dummy_name] = 1
    # Copy raw numeric features straight into matching columns.
    for feature, value in quote_data.items():
        if feature in row.columns:
            row[feature] = value
    row['const'] = 1
    z_score = probit_model.predict(row[probit_feature_columns], which="linear").iloc[0]
    numerator, denominator = norm.pdf(z_score), norm.cdf(z_score)
    if denominator > 0:
        return numerator / denominator
    return 0
def predict_price_with_heckman(new_quote_data, historical_df, probit_model, probit_cols, xgb_model):
    """Single-pass Heckman-style price prediction for one quote.

    Computes the inverse Mills ratio once — using the most recent lagged price
    as the proxy for the (unknown) trade price — then predicts the log price
    with the XGBoost model and returns it on the original USD scale.
    """
    new_quote_data['log_QUOTE_QTY'] = np.log1p(new_quote_data['QUOTE_QTY'])
    quote = {
        **new_quote_data,
        **get_historical_features(new_quote_data['product_id'], historical_df),
    }
    # Use logprice_lag1 as the proxy for final_trade_price in the probit model.
    quote['final_trade_price'] = np.expm1(quote.get('logprice_lag1', 0))
    quote['logprice'] = np.log1p(quote['final_trade_price'])
    quote['logproduct'] = quote['logprice'] * quote['log_QUOTE_QTY']
    quote['inverse_mills_ratio'] = calculate_imr(quote, probit_model, probit_cols)

    frame = pd.DataFrame([quote])
    # XGBoost expects categorical dtype for these columns.
    for cat_col in ('FF_REGION', 'product_id'):
        if cat_col in frame.columns:
            frame[cat_col] = frame[cat_col].astype('category')
    log_price = xgb_model.predict(frame[xgb_model.feature_names_in_])
    return np.expm1(log_price)[0]
def predict_price_with_heckman_iterative(new_quote_data, historical_df, probit_model, probit_cols, xgb_model, naive_model, iterations=4):
    """
    Predicts the price using a multi-step iterative approach for the IMR.

    Step 1 seeds a price proxy with the naive (IMR-free) model; step 2 then
    alternates between recomputing the inverse Mills ratio from the current
    proxy and re-predicting the price, for ``iterations`` passes.
    """
    new_quote_data['log_QUOTE_QTY'] = np.log1p(new_quote_data['QUOTE_QTY'])
    features = {
        **new_quote_data,
        **get_historical_features(new_quote_data['product_id'], historical_df),
    }

    def _as_model_frame(record, model):
        # Build a one-row frame with the categorical dtypes XGBoost expects,
        # restricted/ordered to the model's training features.
        frame = pd.DataFrame([record])
        for cat in ('FF_REGION', 'product_id'):
            if cat in frame.columns:
                frame[cat] = frame[cat].astype('category')
        return frame[model.feature_names_in_]

    # --- Step 1: naive prediction (no selection correction) seeds the proxy ---
    price_proxy = np.expm1(naive_model.predict(_as_model_frame(features, naive_model)))[0]

    # --- Step 2: iterative refinement of IMR and price ---
    predicted_price = 0
    for _ in range(iterations):
        probit_input = features.copy()
        # Evaluate the selection model at the latest price proxy.
        probit_input['final_trade_price'] = price_proxy
        probit_input['logprice'] = np.log1p(price_proxy)
        probit_input['logproduct'] = probit_input['logprice'] * probit_input['log_QUOTE_QTY']
        features['inverse_mills_ratio'] = calculate_imr(probit_input, probit_model, probit_cols)
        # Re-predict with the updated IMR; the result becomes the next proxy.
        predicted_price = np.expm1(xgb_model.predict(_as_model_frame(features, xgb_model)))[0]
        price_proxy = predicted_price
    return predicted_price
# Precomputed per-product, per-region predicted price curves for the graph.
final_predictions_df = pd.read_parquet("FinalPredictions.parquet")
# Rank products by number of winning quotes.
product_counts = df[df['is_win'] == 1]['product_id'].value_counts()
ALL_PRODUCT_IDS = product_counts.index.tolist()
# Curated priority products listed first in the dropdown; they replace the
# top 50 by win count.  NOTE(review): assumes `tops` corresponds to those
# first 50 entries — confirm, otherwise products may be duplicated or dropped.
tops = [14028934426778, 14029403716778, 14029037911778, 14031459622778, 14029657192778, 14029408741778, 14029714238363, 14028933256778, 14029726163363, 14029189621778, 14029135081778, 14029672028363, 14030192348363, 14029043536778, 14031459623363, 14030358713363, 14029650878363, 14029585530188, 14029617390188, 14029711148363, 14030989013363, 14029499296778, 14029435987778, 14030358728363, 14029286461778, 14029137196778, 14029161241778, 14029769813363, 14029539031733, 14029711147778, 14030487623363, 14029650877778, 14030358788363, 14029408732778, 14029405456733, 14029803323363, 14030675078363, 14030675183363, 14029405471733, 14029870838363, 14029585455623, 14029724033363, 14030280698363, 14029485331778, 14029727738363, 14029137241778, 14029829303363, 14029460477213, 14031251048363, 14029287541778]
ALL_PRODUCT_IDS = tops + ALL_PRODUCT_IDS[50:]
ALL_REGIONS = sorted(df['FF_REGION'].unique())
app = dash.Dash(__name__, external_stylesheets=['https://codepen.io/chriddyp/pen/bWLwgP.css'])
app.title = "Predictive Pricing Dashboard"
# Dropdown options: products that have a reference price curve in pricing_df
# are suffixed with " (*)" so users know the overlay checkbox will show data.
dropdown_options = []
for pid in ALL_PRODUCT_IDS:
    has_ref_curve = not pricing_df[pricing_df['product-anonymized'] == int(pid)].empty
    label = f"{pid}{' (*)' if has_ref_curve else ''}"
    dropdown_options.append({'label': label, 'value': pid})
# Page layout: title row, then a two-column flex row — controls plus the
# historical-analysis table on the left, the price-curve graph on the right.
app.layout = html.Div(
    style={'padding': '20px', 'fontFamily': 'Arial, sans-serif'},
    children=[
        # Title + Subtitle
        html.H1(
            "Predictive Pricing Dashboard",
            style={'textAlign': 'center', 'marginLeft': '-10%'}  # shift left
        ),
        html.P(
            "Select a product to see its predicted price curves across different regions.",
            style={'textAlign': 'center', 'marginTop': '0', 'color': 'gray', 'marginLeft': '-10%'}
        ),
        # Main Content Row
        html.Div(
            style={'display': 'flex', 'gap': '20px', 'padding': '20px'},
            children=[
                # --- Left Column: Controls ---
                html.Div(
                    style={'width': '30%', 'backgroundColor': '#f9f9f9', 'padding': '20px', 'borderRadius': '8px'},
                    children=[
                        html.H4("Controls", style={'textAlign': 'center', 'marginBottom': '15px'}),
                        html.Label("Select Product ID:", style={'fontWeight': 'bold'}),
                        # Options come from dropdown_options; "(*)" marks
                        # products that have a reference curve.
                        dcc.Dropdown(
                            id='product-id-dropdown',
                            options=dropdown_options,
                            value=ALL_PRODUCT_IDS[1],  # NOTE(review): index 1, not 0 — confirm the intended default
                            style={'marginBottom': '20px'}
                        ),
                        html.Label("Quantity Scale for Plot:", style={'fontWeight': 'bold'}),
                        # X-axis scale toggle forwarded to the figure layout.
                        dcc.RadioItems(
                            id='scale-radio',
                            options=[
                                {'label': 'Linear', 'value': 'linear'},
                                {'label': 'Log', 'value': 'log'}
                            ],
                            value='linear',
                            labelStyle={'display': 'block'},
                            style={'marginBottom': '20px'}
                        ),
                        # Toggles the reference-curve overlay in the graph.
                        dcc.Checklist(
                            id='show-pricing-df-checklist',
                            options=[{'label': 'Show Reference Price Curve', 'value': 'show_df'}],
                            value=[],
                            style={'marginBottom': '30px'}
                        ),
                        # Table Section
                        html.Div(children=[
                            html.H4("Historical Price Analysis", style={'textAlign': 'center', 'marginBottom': '15px'}),
                            # Quantity window used to filter historical trades.
                            html.Div(
                                style={'display': 'flex', 'gap': '10px', 'justifyContent': 'center', 'marginBottom': '15px'},
                                children=[
                                    html.Label("Min Qty:"),
                                    dcc.Input(id='min-qty-input', type='number', value=50000, style={'width': '100px'}),
                                    html.Label("Max Qty:"),
                                    dcc.Input(id='max-qty-input', type='number', value=100000, style={'width': '100px'})
                                ]
                            ),
                            # Populated by the update_dashboard callback.
                            html.Table(
                                id='price-analysis-table',
                                className='table',
                                style={'margin': 'auto'}
                            )
                        ])
                    ]
                ),
                # --- Right Column: Outputs ---
                html.Div(
                    style={'width': '70%', 'textAlign': 'center'},
                    children=[
                        html.Div(
                            style={'width': '800px', 'height': '600px', 'margin': 'auto'},
                            children=[
                                dcc.Graph(id='price-curve-graph', style={'height': '100%', 'width': '100%'})
                            ]
                        )
                    ]
                )
            ]
        )
    ]
)
@app.callback(
    [Output('price-curve-graph', 'figure'),
     Output('price-analysis-table', 'children'),],
    [Input('product-id-dropdown', 'value'),
     Input('scale-radio', 'value'),
     Input('show-pricing-df-checklist', 'value'),
     Input('min-qty-input', 'value'),
     Input('max-qty-input', 'value')]
)
def update_dashboard(selected_product_id, scale_type, show_df_value, min_qty, max_qty):
    """Rebuild the price-corridor figure and the historical-analysis table.

    Returns:
        (figure, table_children): the Plotly figure for 'price-curve-graph'
        and the Thead/Tbody children for 'price-analysis-table'.
    """
    # Fix: dcc.Input(type='number') delivers None when the box is cleared,
    # which previously raised TypeError in the quantity comparisons below.
    if min_qty is None:
        min_qty = 0
    if max_qty is None:
        max_qty = float('inf')

    # --- 1. Filter predictions for the product and pivot qty x region ---
    fig = go.Figure()
    product_curve_data = final_predictions_df[final_predictions_df['product_id'] == selected_product_id]
    if not product_curve_data.empty:
        pivoted_data = product_curve_data.pivot(index='qty', columns='region', values='predicted_price').dropna()
        # --- 2. Percentiles and global curve ---
        # cummin forces each region's curve to be non-increasing in quantity.
        monotonic_pivoted_data = pivoted_data.cummin()
        p75 = monotonic_pivoted_data.quantile(0.75, axis=1)  # 75th percentile
        p25 = monotonic_pivoted_data.quantile(0.25, axis=1)  # 25th percentile
        global_curve_median = monotonic_pivoted_data.median(axis=1)
        # --- 3. Interquartile band, then the median curve on top ---
        fig.add_trace(go.Scatter(
            x=p25.index, y=p25,
            fill=None, mode='lines', line_color='rgba(100, 100, 100, 0.3)', showlegend=False
        ))
        fig.add_trace(go.Scatter(
            x=p75.index, y=p75,
            fill='tonexty',  # fill the area between this trace and the one above
            mode='lines', line_color='rgba(100, 100, 100, 0.3)',
            name='Interquartile Price Range (25-75%)'
        ))
        fig.add_trace(go.Scatter(
            x=global_curve_median.index, y=global_curve_median,
            mode='lines', name='Global Median Price Curve',
            line=dict(color='black', dash='dash', width=2)
        ))
    # --- 3b. Optional reference price curve overlay ---
    if 'show_df' in show_df_value:
        ref_curve_data = pricing_df[pricing_df['product-anonymized'] == int(selected_product_id)]
        if not ref_curve_data.empty:
            # Transpose the quantity columns into a sorted numeric index.
            ref_curve_numeric = ref_curve_data.drop(columns=['product-anonymized', 'DBC'], errors='ignore').T
            ref_curve_numeric.columns = ['Reference Curve']
            ref_curve_numeric.index = pd.to_numeric(ref_curve_numeric.index, errors="coerce")
            ref_curve_numeric = ref_curve_numeric.dropna().sort_index()
            fig.add_trace(go.Scatter(
                x=ref_curve_numeric.index,
                y=ref_curve_numeric['Reference Curve'],
                mode='lines+markers',
                name='Reference Price Curve',
                line=dict(color='blue', width=2)
            ))
    # --- 4. Figure layout (axis scale follows the radio selection) ---
    fig.update_layout(
        title=f"Target Pricing Corridor for Product: {selected_product_id}",
        xaxis_title="Quote Quantity",
        yaxis_title="Predicted Final Trade Price (USD)",
        legend_title="Price Range",
        template="plotly_white",
        xaxis_type=scale_type.lower()
    )
    # --- 5. Historical price table over several lookback windows ---
    now = datetime.now()
    lookback_periods = {
        "Last 6 Months": now - relativedelta(months=6),
        "Last 1 Year": now - relativedelta(years=1),
        "Last 2 Years": now - relativedelta(years=2),
        "All Time": df_accepted['QUOTE_ITEM_CREATE_DATE'].min()
    }
    table_data = []
    for period_name, start_date in lookback_periods.items():
        # NOTE(review): product_id is compared as str here but as int against
        # pricing_df above — confirm the stored dtypes agree with both uses.
        hist_filtered = df_accepted[
            (df_accepted['product_id'] == str(selected_product_id)) &
            (df_accepted['QUOTE_QTY'] >= min_qty) &
            (df_accepted['QUOTE_QTY'] <= max_qty) &
            (df_accepted['QUOTE_ITEM_CREATE_DATE'] >= start_date)
        ]
        if not hist_filtered.empty:
            avg_actual_price = hist_filtered['final_trade_price'].mean()
            avg_predicted_price = hist_filtered['predicted_price'].mean()
            num_trades = len(hist_filtered)
            table_data.append({
                "Lookback Period": period_name,
                "Avg. Actual Price": f"${avg_actual_price:.2f}",
                "Avg. Predicted Price": f"${avg_predicted_price:.2f}",
                "Number of Trades": num_trades
            })
    table_header = [html.Thead(html.Tr([html.Th(col) for col in ["Lookback Period", "Avg. Actual Price", "Avg. Predicted Price", "Number of Trades"]]))]
    table_body = [html.Tbody([html.Tr([html.Td(row[col]) for col in row]) for row in table_data])]
    return fig, table_header + table_body
# Expose the underlying Flask server for WSGI deployment (e.g. gunicorn).
server = app.server
if __name__ == '__main__':
    # Local/dev entry point; binds on all interfaces at port 7860.
    app.run(host='0.0.0.0', port=7860)