# eller_nxp / app.py
# Author: Anirudha Soni
# Commit 401443c — "Enable reference curve function"
import pandas as pd
import numpy as np
from prepare import Process
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import plotly.graph_objects as go
from datetime import datetime
from dateutil.relativedelta import relativedelta
from scipy.stats import norm
# --- Data ingestion & reference price-curve assembly -------------------------
# Run the project's preprocessing pipeline over the raw transaction/quote data.
process_data = Process(
    transaction_file='transaction_data.parquet',
    quote_data_dir='quote_data/',
    end_date='2025-07-01',
    mini=5000,
    maxi=500000,
    ct=100,
)
df_origin = process_data.run()

# Auxiliary datasets: lost-quote predictions and the two reference price curves.
df_lost = pd.read_parquet("lost_preds.parquet")
pricing_df1 = pd.read_parquet("price_curve.parquet")
pricing_df2 = pd.read_parquet("price_curve2.parquet")

# Normalize the first curve's headers, then map "+<qty>u"-style column names to
# plain unit-count strings so both curve files share one schema.
pricing_df1.columns = pricing_df1.columns.str.strip()
rename_map = {
    "+5Ku": "5000",
    "+10Ku": "10000",
    "+25Ku": "25000",
    "+50Ku": "50000",
    "+100Ku": "100000",
    "+250Ku": "250000",
    "+500Ku": "500000",
    "+1Ku": "1000",
    "+1u": "1",
    "+1Mu": "1000000",
}
cols_to_keep = [
    "product-anonymized", "DBC",
    "5000", "10000", "25000", "50000", "100000", "250000", "500000",
]
# The second file may carry numeric column labels; force them to strings so
# the shared column selection works on both frames.
pricing_df2.columns = pricing_df2.columns.map(str)
pricing_df_renamed = pricing_df1.rename(columns=rename_map)
df1 = pricing_df_renamed[cols_to_keep]
df2 = pricing_df2[cols_to_keep]
# One combined reference-curve table keyed by "product-anonymized".
pricing_df = pd.concat([df1, df2], ignore_index=True)
# --- Build the modelling frame from Tier-4 customer transactions -------------
logit_cols = ['QUOTE_ITEM_CREATE_DATE', 'QUOTE_QTY', 'FF_REGION', 'FINAL_QUOTED_RESALE_USD', 'DISTI_MARGIN', 'REGISTRATION_EFFORT', 'product', 'WON/LOSS', 'log_quote_qty', 'age']
tier4_mask = df_origin['END_CUSTOMER_CATEGORY'].isin(
    ['Tier 4 - TMMA', 'Tier 4 - ROM', 'Tier 4 - Longtail'])
df = df_origin.loc[tier4_mask, logit_cols].copy()
df = df.rename(columns={
    'product': 'product_id',
    'FINAL_QUOTED_RESALE_USD': 'final_trade_price',
    'log_quote_qty': 'log_QUOTE_QTY',
    'WON/LOSS': 'is_win',
})

# Keep only products with enough quote history (> 100 rows) to model.
product_counts = df['product_id'].value_counts()
keys = product_counts[product_counts > 100].index.tolist()
df = df[df['product_id'].isin(keys)]

df['QUOTE_ITEM_CREATE_DATE'] = pd.to_datetime(df['QUOTE_ITEM_CREATE_DATE'])
df['logprice'] = np.log1p(df['final_trade_price'])
# Interaction term: log price x log quantity.
df['logproduct'] = df['logprice'] * df['log_QUOTE_QTY']

# --- Lag features, computed per product in chronological order ---------------
df = df.sort_values(['product_id', 'QUOTE_ITEM_CREATE_DATE'])
for lag in (1, 2, 3):
    df[f'logprice_lag{lag}'] = df.groupby('product_id')['logprice'].shift(lag)
for lag in (1, 2, 3):
    df[f'logproduct_lag{lag}'] = df.groupby('product_id')['logproduct'].shift(lag)

# 7-observation rolling means of the one-step-lagged series.
for base in ('logprice', 'logproduct'):
    df[f'{base}_mean7'] = df.groupby('product_id')[f'{base}_lag1'].transform(
        lambda s: s.rolling(window=7, min_periods=1).mean()
    )

# Rows without a full lag history (or price) cannot be used for training.
df = df.dropna(subset=['logprice_lag1', 'logprice_lag2', 'logprice_lag3', 'final_trade_price'])
df = df.sort_values('QUOTE_ITEM_CREATE_DATE').reset_index(drop=True)

# Accepted-quote history used by the dashboard's lookback table.
required_columns = [
    'product_id',
    'QUOTE_QTY',
    'QUOTE_ITEM_CREATE_DATE',
    'final_trade_price',
    'predicted_price',
]
df_accepted = pd.read_parquet("df_accepted_for_dashboard.parquet", columns=required_columns)
def get_historical_features(product_id, historical_df):
    """Return lag/rolling price features for ``product_id`` from ``historical_df``.

    The most recent observation is treated as the newest lag for a brand-new
    quote: its logprice becomes lag1, its lag1 becomes lag2, and so on.

    Fix vs. previous version: the empty-history branch now returns the same
    key set as the non-empty branch (adds 'REGISTRATION_EFFORT' and
    'DISTI_MARGIN' defaults), so downstream feature dicts are always complete;
    mode() is indexed positionally with .iloc[0] instead of label-based [0].
    """
    product_history = historical_df[historical_df['product_id'] == product_id].sort_values('QUOTE_ITEM_CREATE_DATE')
    if product_history.empty:
        # Cold-start fallback: zeroed lags plus the same default categorical /
        # margin values the non-empty branch would otherwise supply.
        # NOTE(review): 0 margin is a neutral placeholder — confirm acceptable.
        return {
            'logprice_lag1': 0, 'logprice_lag2': 0, 'logprice_lag3': 0,
            'logproduct_lag1': 0, 'logproduct_lag2': 0, 'logproduct_lag3': 0,
            'logprice_mean7': 0, 'logproduct_mean7': 0,
            'REGISTRATION_EFFORT': 'FRP', 'DISTI_MARGIN': 0,
        }
    last_record = product_history.iloc[-1]
    # Shift the lag structure forward by one step relative to the last record.
    features = {
        'logprice_lag1': last_record['logprice'],
        'logprice_lag2': last_record['logprice_lag1'],
        'logprice_lag3': last_record['logprice_lag2'],
        'logproduct_lag1': last_record['logproduct'],
        'logproduct_lag2': last_record['logproduct_lag1'],
        'logproduct_lag3': last_record['logproduct_lag2'],
    }
    # Rolling means over the last (up to) 7 lagged observations (NaN-aware).
    recent_history = product_history.set_index('QUOTE_ITEM_CREATE_DATE').tail(7)
    features['logprice_mean7'] = recent_history['logprice_lag1'].mean()
    features['logproduct_mean7'] = recent_history['logproduct_lag1'].mean()
    # Most common registration effort for this product; 'FRP' if undecidable.
    reg_effort_mode = product_history['REGISTRATION_EFFORT'].mode()
    features['REGISTRATION_EFFORT'] = reg_effort_mode.iloc[0] if not reg_effort_mode.empty else 'FRP'
    features['DISTI_MARGIN'] = product_history['DISTI_MARGIN'].median()
    return features
def calculate_imr(quote_data, probit_model, probit_feature_columns):
    """Compute the inverse Mills ratio phi(z)/Phi(z) for one quote.

    Builds a single-row design matrix: one-hot columns named "<col>_<value>"
    are set to 1 when present, raw numeric columns are copied through, and a
    'const' intercept column is set to 1.  The probit model supplies the
    linear-index z; returns 0 when the CDF is non-positive.
    """
    row = pd.DataFrame(columns=probit_feature_columns, index=[0], dtype=float).fillna(0)
    # Activate any dummy column matching "<feature>_<value>".
    for feature, value in quote_data.items():
        dummy_name = f"{feature}_{value}"
        if dummy_name in row.columns:
            row[dummy_name] = 1
    # Copy raw numeric features straight into matching columns.
    for feature, value in quote_data.items():
        if feature in row.columns:
            row[feature] = value
    row['const'] = 1
    z_score = probit_model.predict(row[probit_feature_columns], which="linear").iloc[0]
    numerator, denominator = norm.pdf(z_score), norm.cdf(z_score)
    if denominator > 0:
        return numerator / denominator
    return 0
def predict_price_with_heckman(new_quote_data, historical_df, probit_model, probit_cols, xgb_model):
    """Single-pass Heckman-style price prediction for one quote.

    Computes the inverse Mills ratio once — using the most recent lagged price
    as the proxy for the (unknown) trade price — then predicts the log price
    with the XGBoost model and returns it on the original USD scale.
    """
    new_quote_data['log_QUOTE_QTY'] = np.log1p(new_quote_data['QUOTE_QTY'])
    quote = {
        **new_quote_data,
        **get_historical_features(new_quote_data['product_id'], historical_df),
    }
    # Use logprice_lag1 as the proxy for final_trade_price in the probit model.
    quote['final_trade_price'] = np.expm1(quote.get('logprice_lag1', 0))
    quote['logprice'] = np.log1p(quote['final_trade_price'])
    quote['logproduct'] = quote['logprice'] * quote['log_QUOTE_QTY']
    quote['inverse_mills_ratio'] = calculate_imr(quote, probit_model, probit_cols)

    frame = pd.DataFrame([quote])
    # XGBoost expects categorical dtype for these columns.
    for cat_col in ('FF_REGION', 'product_id'):
        if cat_col in frame.columns:
            frame[cat_col] = frame[cat_col].astype('category')
    log_price = xgb_model.predict(frame[xgb_model.feature_names_in_])
    return np.expm1(log_price)[0]
def predict_price_with_heckman_iterative(new_quote_data, historical_df, probit_model, probit_cols, xgb_model, naive_model, iterations=4):
    """
    Predicts the price using a multi-step iterative approach for the IMR.

    Step 1 seeds a price proxy with the naive (IMR-free) model; step 2 then
    alternates between recomputing the inverse Mills ratio from the current
    proxy and re-predicting the price, for ``iterations`` passes.
    """
    new_quote_data['log_QUOTE_QTY'] = np.log1p(new_quote_data['QUOTE_QTY'])
    features = {
        **new_quote_data,
        **get_historical_features(new_quote_data['product_id'], historical_df),
    }

    def _as_model_frame(record, model):
        # Build a one-row frame with the categorical dtypes XGBoost expects,
        # restricted/ordered to the model's training features.
        frame = pd.DataFrame([record])
        for cat in ('FF_REGION', 'product_id'):
            if cat in frame.columns:
                frame[cat] = frame[cat].astype('category')
        return frame[model.feature_names_in_]

    # --- Step 1: naive prediction (no selection correction) seeds the proxy ---
    price_proxy = np.expm1(naive_model.predict(_as_model_frame(features, naive_model)))[0]

    # --- Step 2: iterative refinement of IMR and price ---
    predicted_price = 0
    for _ in range(iterations):
        probit_input = features.copy()
        # Evaluate the selection model at the latest price proxy.
        probit_input['final_trade_price'] = price_proxy
        probit_input['logprice'] = np.log1p(price_proxy)
        probit_input['logproduct'] = probit_input['logprice'] * probit_input['log_QUOTE_QTY']
        features['inverse_mills_ratio'] = calculate_imr(probit_input, probit_model, probit_cols)
        # Re-predict with the updated IMR; the result becomes the next proxy.
        predicted_price = np.expm1(xgb_model.predict(_as_model_frame(features, xgb_model)))[0]
        price_proxy = predicted_price
    return predicted_price
# Precomputed per-product, per-region predicted price curves for the graph.
final_predictions_df = pd.read_parquet("FinalPredictions.parquet")
# Rank products by number of winning quotes.
product_counts = df[df['is_win'] == 1]['product_id'].value_counts()
ALL_PRODUCT_IDS = product_counts.index.tolist()
# Curated priority products listed first in the dropdown; they replace the
# top 50 by win count.  NOTE(review): assumes `tops` corresponds to those
# first 50 entries — confirm, otherwise products may be duplicated or dropped.
tops = [14028934426778, 14029403716778, 14029037911778, 14031459622778, 14029657192778, 14029408741778, 14029714238363, 14028933256778, 14029726163363, 14029189621778, 14029135081778, 14029672028363, 14030192348363, 14029043536778, 14031459623363, 14030358713363, 14029650878363, 14029585530188, 14029617390188, 14029711148363, 14030989013363, 14029499296778, 14029435987778, 14030358728363, 14029286461778, 14029137196778, 14029161241778, 14029769813363, 14029539031733, 14029711147778, 14030487623363, 14029650877778, 14030358788363, 14029408732778, 14029405456733, 14029803323363, 14030675078363, 14030675183363, 14029405471733, 14029870838363, 14029585455623, 14029724033363, 14030280698363, 14029485331778, 14029727738363, 14029137241778, 14029829303363, 14029460477213, 14031251048363, 14029287541778]
ALL_PRODUCT_IDS = tops + ALL_PRODUCT_IDS[50:]
ALL_REGIONS = sorted(df['FF_REGION'].unique())
app = dash.Dash(__name__, external_stylesheets=['https://codepen.io/chriddyp/pen/bWLwgP.css'])
app.title = "Predictive Pricing Dashboard"
# Dropdown options: products that have a reference price curve in pricing_df
# are suffixed with " (*)" so users know the overlay checkbox will show data.
dropdown_options = []
for pid in ALL_PRODUCT_IDS:
    has_ref_curve = not pricing_df[pricing_df['product-anonymized'] == int(pid)].empty
    label = f"{pid}{' (*)' if has_ref_curve else ''}"
    dropdown_options.append({'label': label, 'value': pid})
# Page layout: title row, then a two-column flex row — controls plus the
# historical-analysis table on the left, the price-curve graph on the right.
app.layout = html.Div(
    style={'padding': '20px', 'fontFamily': 'Arial, sans-serif'},
    children=[
        # Title + Subtitle
        html.H1(
            "Predictive Pricing Dashboard",
            style={'textAlign': 'center', 'marginLeft': '-10%'}  # shift left
        ),
        html.P(
            "Select a product to see its predicted price curves across different regions.",
            style={'textAlign': 'center', 'marginTop': '0', 'color': 'gray', 'marginLeft': '-10%'}
        ),
        # Main Content Row
        html.Div(
            style={'display': 'flex', 'gap': '20px', 'padding': '20px'},
            children=[
                # --- Left Column: Controls ---
                html.Div(
                    style={'width': '30%', 'backgroundColor': '#f9f9f9', 'padding': '20px', 'borderRadius': '8px'},
                    children=[
                        html.H4("Controls", style={'textAlign': 'center', 'marginBottom': '15px'}),
                        html.Label("Select Product ID:", style={'fontWeight': 'bold'}),
                        # Options come from dropdown_options; "(*)" marks
                        # products that have a reference curve.
                        dcc.Dropdown(
                            id='product-id-dropdown',
                            options=dropdown_options,
                            value=ALL_PRODUCT_IDS[1],  # NOTE(review): index 1, not 0 — confirm the intended default
                            style={'marginBottom': '20px'}
                        ),
                        html.Label("Quantity Scale for Plot:", style={'fontWeight': 'bold'}),
                        # X-axis scale toggle forwarded to the figure layout.
                        dcc.RadioItems(
                            id='scale-radio',
                            options=[
                                {'label': 'Linear', 'value': 'linear'},
                                {'label': 'Log', 'value': 'log'}
                            ],
                            value='linear',
                            labelStyle={'display': 'block'},
                            style={'marginBottom': '20px'}
                        ),
                        # Toggles the reference-curve overlay in the graph.
                        dcc.Checklist(
                            id='show-pricing-df-checklist',
                            options=[{'label': 'Show Reference Price Curve', 'value': 'show_df'}],
                            value=[],
                            style={'marginBottom': '30px'}
                        ),
                        # Table Section
                        html.Div(children=[
                            html.H4("Historical Price Analysis", style={'textAlign': 'center', 'marginBottom': '15px'}),
                            # Quantity window used to filter historical trades.
                            html.Div(
                                style={'display': 'flex', 'gap': '10px', 'justifyContent': 'center', 'marginBottom': '15px'},
                                children=[
                                    html.Label("Min Qty:"),
                                    dcc.Input(id='min-qty-input', type='number', value=50000, style={'width': '100px'}),
                                    html.Label("Max Qty:"),
                                    dcc.Input(id='max-qty-input', type='number', value=100000, style={'width': '100px'})
                                ]
                            ),
                            # Populated by the update_dashboard callback.
                            html.Table(
                                id='price-analysis-table',
                                className='table',
                                style={'margin': 'auto'}
                            )
                        ])
                    ]
                ),
                # --- Right Column: Outputs ---
                html.Div(
                    style={'width': '70%', 'textAlign': 'center'},
                    children=[
                        html.Div(
                            style={'width': '800px', 'height': '600px', 'margin': 'auto'},
                            children=[
                                dcc.Graph(id='price-curve-graph', style={'height': '100%', 'width': '100%'})
                            ]
                        )
                    ]
                )
            ]
        )
    ]
)
@app.callback(
    [Output('price-curve-graph', 'figure'),
     Output('price-analysis-table', 'children'),],
    [Input('product-id-dropdown', 'value'),
     Input('scale-radio', 'value'),
     Input('show-pricing-df-checklist', 'value'),
     Input('min-qty-input', 'value'),
     Input('max-qty-input', 'value')]
)
def update_dashboard(selected_product_id, scale_type, show_df_value, min_qty, max_qty):
    """Rebuild the price-corridor figure and the historical-analysis table.

    Returns:
        (figure, table_children): the Plotly figure for 'price-curve-graph'
        and the Thead/Tbody children for 'price-analysis-table'.
    """
    # Fix: dcc.Input(type='number') delivers None when the box is cleared,
    # which previously raised TypeError in the quantity comparisons below.
    if min_qty is None:
        min_qty = 0
    if max_qty is None:
        max_qty = float('inf')

    # --- 1. Filter predictions for the product and pivot qty x region ---
    fig = go.Figure()
    product_curve_data = final_predictions_df[final_predictions_df['product_id'] == selected_product_id]
    if not product_curve_data.empty:
        pivoted_data = product_curve_data.pivot(index='qty', columns='region', values='predicted_price').dropna()
        # --- 2. Percentiles and global curve ---
        # cummin forces each region's curve to be non-increasing in quantity.
        monotonic_pivoted_data = pivoted_data.cummin()
        p75 = monotonic_pivoted_data.quantile(0.75, axis=1)  # 75th percentile
        p25 = monotonic_pivoted_data.quantile(0.25, axis=1)  # 25th percentile
        global_curve_median = monotonic_pivoted_data.median(axis=1)
        # --- 3. Interquartile band, then the median curve on top ---
        fig.add_trace(go.Scatter(
            x=p25.index, y=p25,
            fill=None, mode='lines', line_color='rgba(100, 100, 100, 0.3)', showlegend=False
        ))
        fig.add_trace(go.Scatter(
            x=p75.index, y=p75,
            fill='tonexty',  # fill the area between this trace and the one above
            mode='lines', line_color='rgba(100, 100, 100, 0.3)',
            name='Interquartile Price Range (25-75%)'
        ))
        fig.add_trace(go.Scatter(
            x=global_curve_median.index, y=global_curve_median,
            mode='lines', name='Global Median Price Curve',
            line=dict(color='black', dash='dash', width=2)
        ))
    # --- 3b. Optional reference price curve overlay ---
    if 'show_df' in show_df_value:
        ref_curve_data = pricing_df[pricing_df['product-anonymized'] == int(selected_product_id)]
        if not ref_curve_data.empty:
            # Transpose the quantity columns into a sorted numeric index.
            ref_curve_numeric = ref_curve_data.drop(columns=['product-anonymized', 'DBC'], errors='ignore').T
            ref_curve_numeric.columns = ['Reference Curve']
            ref_curve_numeric.index = pd.to_numeric(ref_curve_numeric.index, errors="coerce")
            ref_curve_numeric = ref_curve_numeric.dropna().sort_index()
            fig.add_trace(go.Scatter(
                x=ref_curve_numeric.index,
                y=ref_curve_numeric['Reference Curve'],
                mode='lines+markers',
                name='Reference Price Curve',
                line=dict(color='blue', width=2)
            ))
    # --- 4. Figure layout (axis scale follows the radio selection) ---
    fig.update_layout(
        title=f"Target Pricing Corridor for Product: {selected_product_id}",
        xaxis_title="Quote Quantity",
        yaxis_title="Predicted Final Trade Price (USD)",
        legend_title="Price Range",
        template="plotly_white",
        xaxis_type=scale_type.lower()
    )
    # --- 5. Historical price table over several lookback windows ---
    now = datetime.now()
    lookback_periods = {
        "Last 6 Months": now - relativedelta(months=6),
        "Last 1 Year": now - relativedelta(years=1),
        "Last 2 Years": now - relativedelta(years=2),
        "All Time": df_accepted['QUOTE_ITEM_CREATE_DATE'].min()
    }
    table_data = []
    for period_name, start_date in lookback_periods.items():
        # NOTE(review): product_id is compared as str here but as int against
        # pricing_df above — confirm the stored dtypes agree with both uses.
        hist_filtered = df_accepted[
            (df_accepted['product_id'] == str(selected_product_id)) &
            (df_accepted['QUOTE_QTY'] >= min_qty) &
            (df_accepted['QUOTE_QTY'] <= max_qty) &
            (df_accepted['QUOTE_ITEM_CREATE_DATE'] >= start_date)
        ]
        if not hist_filtered.empty:
            avg_actual_price = hist_filtered['final_trade_price'].mean()
            avg_predicted_price = hist_filtered['predicted_price'].mean()
            num_trades = len(hist_filtered)
            table_data.append({
                "Lookback Period": period_name,
                "Avg. Actual Price": f"${avg_actual_price:.2f}",
                "Avg. Predicted Price": f"${avg_predicted_price:.2f}",
                "Number of Trades": num_trades
            })
    table_header = [html.Thead(html.Tr([html.Th(col) for col in ["Lookback Period", "Avg. Actual Price", "Avg. Predicted Price", "Number of Trades"]]))]
    table_body = [html.Tbody([html.Tr([html.Td(row[col]) for col in row]) for row in table_data])]
    return fig, table_header + table_body
# Expose the underlying Flask server for WSGI deployment (e.g. gunicorn).
server = app.server
if __name__ == '__main__':
    # Local/dev entry point; binds on all interfaces at port 7860.
    app.run(host='0.0.0.0', port=7860)