| import pandas as pd |
| import numpy as np |
| from prepare import Process |
| import dash |
| from dash import dcc, html |
| from dash.dependencies import Input, Output |
| import plotly.graph_objects as go |
| from datetime import datetime |
| from dateutil.relativedelta import relativedelta |
|
|
| from scipy.stats import norm |
|
|
# Build the cleaned transaction dataset via the project's Process pipeline.
# mini/maxi bound the quote quantities considered and ct is a count
# threshold -- presumably per-product minimums; TODO confirm against prepare.Process.
process_data = Process(
    transaction_file='transaction_data.parquet',
    quote_data_dir='quote_data/',
    end_date='2025-07-01',
    mini = 5000,
    maxi = 500000,
    ct = 100
)
df_origin = process_data.run()


# Pre-computed predictions for lost quotes (loaded here; not referenced below).
df_lost = pd.read_parquet("lost_preds.parquet")


# Two vintages of reference price-curve files, harmonized and stacked below.
pricing_df1 = pd.read_parquet("price_curve.parquet")
pricing_df2 = pd.read_parquet("price_curve2.parquet")
|
|
# Normalize headers on the first curve file (names carry stray whitespace).
pricing_df1.columns = pricing_df1.columns.str.strip()


# Map vendor-style quantity-break labels (e.g. "+25Ku" = 25,000 units) to
# plain numeric strings so both curve files share one column scheme.
rename_map = {
    "+5Ku": "5000",
    "+10Ku": "10000",
    "+25Ku": "25000",
    "+50Ku": "50000",
    "+100Ku": "100000",
    "+250Ku": "250000",
    "+500Ku": "500000",
    "+1Ku": "1000",
    "+1u": "1",
    "+1Mu": "1000000"
}
pricing_df_renamed = pricing_df1.rename(columns=rename_map)
# Only the quantity breaks the dashboard plots (5K..500K units) plus ids.
cols_to_keep = ["product-anonymized","DBC","5000","10000","25000","50000","100000","250000","500000"]
# Second file already uses numeric headers; coerce to str so selection matches.
pricing_df2.columns = pricing_df2.columns.map(str)


df1 = pricing_df_renamed[cols_to_keep]
df2 = pricing_df2[cols_to_keep]


# Single reference-curve table used for the optional overlay in the dashboard.
pricing_df = pd.concat([df1, df2], ignore_index=True)
|
|
# Columns needed by the selection (probit) and price models.
logit_cols = ['QUOTE_ITEM_CREATE_DATE', 'QUOTE_QTY', 'FF_REGION', 'FINAL_QUOTED_RESALE_USD', 'DISTI_MARGIN', 'REGISTRATION_EFFORT', 'product', 'WON/LOSS', 'log_quote_qty', 'age']
# Restrict to the Tier-4 long-tail customer segments this dashboard targets.
df = df_origin[df_origin['END_CUSTOMER_CATEGORY'].isin(['Tier 4 - TMMA', 'Tier 4 - ROM', 'Tier 4 - Longtail'])][logit_cols].copy()


# Standardize column names used throughout the rest of the script.
df.rename(columns={
    'product': 'product_id',
    'FINAL_QUOTED_RESALE_USD': 'final_trade_price',
    'log_quote_qty' : 'log_QUOTE_QTY',
    'WON/LOSS':'is_win'
}, inplace=True)


# Keep only products with enough observations (>100 quotes) to model reliably.
product_counts = df['product_id'].value_counts()
keys = product_counts[product_counts > 100].index.tolist()
df = df[df['product_id'].isin(keys)]


df['QUOTE_ITEM_CREATE_DATE'] = pd.to_datetime(df['QUOTE_ITEM_CREATE_DATE'])
# log1p keeps zero prices finite; logproduct is the price x log-quantity interaction.
df['logprice'] = np.log1p(df['final_trade_price'])
df['logproduct'] = df['logprice']*df['log_QUOTE_QTY']
|
|
| |
# Order each product's quotes chronologically so every shift looks strictly
# backwards in time (no leakage of future prices into lag features).
df = df.sort_values(['product_id', 'QUOTE_ITEM_CREATE_DATE'])

# Three autoregressive lags for the log price and the price*qty interaction.
for base in ('logprice', 'logproduct'):
    for k in (1, 2, 3):
        df[f'{base}_lag{k}'] = df.groupby('product_id')[base].shift(k)

# Trailing 7-row mean of the first lag per product (still leakage-free, since
# it is built from already-shifted values).
for base in ('logprice', 'logproduct'):
    df[f'{base}_mean7'] = df.groupby('product_id')[f'{base}_lag1'].transform(
        lambda s: s.rolling(window=7, min_periods=1).mean()
    )

# Drop rows whose lag window is incomplete, then restore a global time order.
df = df.dropna(subset=['logprice_lag1', 'logprice_lag2', 'logprice_lag3', 'final_trade_price'])
df = df.sort_values('QUOTE_ITEM_CREATE_DATE').reset_index(drop=True)
|
|
# Columns needed for the historical price-analysis table in the callback.
required_columns = [
    'product_id',
    'QUOTE_QTY',
    'QUOTE_ITEM_CREATE_DATE',
    'final_trade_price',
    'predicted_price'
]
# Accepted (won) quotes with their model predictions, loaded column-pruned
# to keep memory down.
df_accepted = pd.read_parquet("df_accepted_for_dashboard.parquet", columns=required_columns)
|
|
def get_historical_features(product_id, historical_df):
    """Build the lag/rolling-mean feature set for a fresh quote on ``product_id``.

    Looks up the product's history in ``historical_df`` (which must already
    carry the ``logprice``/``logproduct`` lag columns computed above) and
    shifts the chain forward one step: the most recent realized level becomes
    today's lag1, yesterday's lag1 becomes today's lag2, and so on.

    Parameters
    ----------
    product_id : hashable
        Product identifier to look up.
    historical_df : pd.DataFrame
        Feature-engineered history with columns QUOTE_ITEM_CREATE_DATE,
        logprice, logprice_lag1/2, logproduct, logproduct_lag1/2,
        REGISTRATION_EFFORT and DISTI_MARGIN.

    Returns
    -------
    dict
        Feature name -> value. Always contains the six lag features, the two
        7-row means, REGISTRATION_EFFORT and DISTI_MARGIN (neutral defaults
        for products with no history).
    """
    product_history = historical_df[historical_df['product_id'] == product_id].sort_values('QUOTE_ITEM_CREATE_DATE')
    if product_history.empty:
        # Cold-start product: neutral defaults. REGISTRATION_EFFORT and
        # DISTI_MARGIN were previously missing from this branch, leaving the
        # downstream feature frame without those columns for unseen products.
        return {
            'logprice_lag1': 0, 'logprice_lag2': 0, 'logprice_lag3': 0,
            'logproduct_lag1': 0, 'logproduct_lag2': 0, 'logproduct_lag3': 0,
            'logprice_mean7': 0, 'logproduct_mean7': 0,
            'REGISTRATION_EFFORT': 'FRP', 'DISTI_MARGIN': 0,
        }

    last_record = product_history.iloc[-1]
    # Shift the lag chain one step forward in time.
    features = {
        'logprice_lag1': last_record['logprice'],
        'logprice_lag2': last_record['logprice_lag1'],
        'logprice_lag3': last_record['logprice_lag2'],
        'logproduct_lag1': last_record['logproduct'],
        'logproduct_lag2': last_record['logproduct_lag1'],
        'logproduct_lag3': last_record['logproduct_lag2'],
    }
    # Trailing mean over the last (up to) 7 records of the lag1 series,
    # mirroring the logprice_mean7 construction used in training.
    recent_history = product_history.tail(7)
    features['logprice_mean7'] = recent_history['logprice_lag1'].mean()
    features['logproduct_mean7'] = recent_history['logproduct_lag1'].mean()

    # Most common registration effort; .iloc guards against non-default indexes.
    reg_effort_mode = product_history['REGISTRATION_EFFORT'].mode()
    features['REGISTRATION_EFFORT'] = reg_effort_mode.iloc[0] if not reg_effort_mode.empty else 'FRP'
    features['DISTI_MARGIN'] = product_history['DISTI_MARGIN'].median()

    return features
|
|
def calculate_imr(quote_data, probit_model, probit_feature_columns):
    """Inverse Mills ratio phi(z)/Phi(z) from the selection probit.

    Builds a single-row design matrix over ``probit_feature_columns``,
    one-hot-encoding categorical quote fields as ``<col>_<value>`` and copying
    continuous fields directly, then evaluates the probit's linear index.
    Returns 0 when Phi(z) is not strictly positive.
    """
    # Start from an all-zero one-row design matrix over the probit features.
    design = pd.DataFrame(0.0, index=[0], columns=probit_feature_columns)

    # Categorical fields: flip the matching "<col>_<value>" dummy to 1.
    for name, value in quote_data.items():
        one_hot = f"{name}_{value}"
        if one_hot in design.columns:
            design[one_hot] = 1

    # Continuous fields: copy the raw value where the model uses it directly.
    for name, value in quote_data.items():
        if name in design.columns:
            design[name] = value

    design['const'] = 1

    # Linear index z = x'beta from the fitted probit.
    z = probit_model.predict(design[probit_feature_columns], which="linear").iloc[0]
    numerator = norm.pdf(z)
    denominator = norm.cdf(z)
    if denominator > 0:
        return numerator / denominator
    return 0
|
|
def predict_price_with_heckman(new_quote_data, historical_df, probit_model, probit_cols, xgb_model):
    """One-shot Heckman-corrected price prediction for a single new quote.

    Uses the most recent observed log price as a proxy for the unknown trade
    price, evaluates the selection probit to get the inverse Mills ratio, and
    feeds it with the lag features into the second-stage XGBoost price model.

    Parameters
    ----------
    new_quote_data : dict
        Raw quote fields; must include 'QUOTE_QTY' and 'product_id'.
    historical_df : pd.DataFrame
        Feature-engineered history (see get_historical_features).
    probit_model : fitted selection model exposing predict(..., which="linear").
    probit_cols : list
        Probit design-matrix column names.
    xgb_model : fitted regressor exposing feature_names_in_ and predict().

    Returns
    -------
    float
        Predicted final trade price in USD.
    """
    # Work on a copy so the caller's dict is not mutated (the original wrote
    # log_QUOTE_QTY back into the caller's object).
    new_quote_data = dict(new_quote_data)
    new_quote_data['log_QUOTE_QTY'] = np.log1p(new_quote_data['QUOTE_QTY'])
    historical_features = get_historical_features(new_quote_data['product_id'], historical_df)
    full_quote_data = {**new_quote_data, **historical_features}

    # Proxy the (unknown) trade price with the most recent observed level.
    full_quote_data['final_trade_price'] = np.expm1(full_quote_data.get('logprice_lag1', 0))
    full_quote_data['logprice'] = np.log1p(full_quote_data['final_trade_price'])
    full_quote_data['logproduct'] = full_quote_data['logprice'] * full_quote_data['log_QUOTE_QTY']

    # First stage: inverse Mills ratio from the selection probit.
    imr_value = calculate_imr(full_quote_data, probit_model, probit_cols)
    full_quote_data['inverse_mills_ratio'] = imr_value

    # Second stage: XGBoost on log price; categoricals must be typed as such.
    prediction_df_xgb = pd.DataFrame([full_quote_data])
    for col in ['FF_REGION', 'product_id']:
        if col in prediction_df_xgb.columns:
            prediction_df_xgb[col] = prediction_df_xgb[col].astype('category')

    predicted_log_price = xgb_model.predict(prediction_df_xgb[xgb_model.feature_names_in_])
    return np.expm1(predicted_log_price)[0]
|
|
def predict_price_with_heckman_iterative(new_quote_data, historical_df, probit_model, probit_cols, xgb_model, naive_model, iterations=4):
    """
    Predicts the price using a multi-step iterative approach for the IMR.

    A naive (no-IMR) model seeds the price proxy; each pass re-evaluates the
    selection probit at the current proxy, recomputes the inverse Mills ratio,
    and re-predicts the price, fixed-point style, for ``iterations`` passes.

    Parameters
    ----------
    new_quote_data : dict
        Raw quote fields; must include 'QUOTE_QTY' and 'product_id'.
    historical_df : pd.DataFrame
        Feature-engineered history (see get_historical_features).
    probit_model : fitted selection model exposing predict(..., which="linear").
    probit_cols : list
        Probit design-matrix column names.
    xgb_model : second-stage regressor (expects inverse_mills_ratio feature).
    naive_model : seed regressor without the IMR feature.
    iterations : int, optional
        Number of IMR refinement passes (default 4).

    Returns
    -------
    float
        Predicted final trade price in USD after the last pass.
    """
    # Work on a copy so the caller's dict is not mutated (the original wrote
    # log_QUOTE_QTY back into the caller's object).
    new_quote_data = dict(new_quote_data)
    new_quote_data['log_QUOTE_QTY'] = np.log1p(new_quote_data['QUOTE_QTY'])
    historical_features = get_historical_features(new_quote_data['product_id'], historical_df)
    full_quote_data = {**new_quote_data, **historical_features}

    # Seed the price proxy with the naive (IMR-free) model.
    prediction_df_naive = pd.DataFrame([full_quote_data])
    for col in ['FF_REGION', 'product_id']:
        if col in prediction_df_naive.columns:
            prediction_df_naive[col] = prediction_df_naive[col].astype('category')

    naive_log_price = naive_model.predict(prediction_df_naive[naive_model.feature_names_in_])
    current_price_proxy = np.expm1(naive_log_price)[0]

    imr_value = 0
    final_predicted_price = 0

    for _ in range(iterations):
        # Evaluate the selection probit at the current price proxy.
        probit_input = full_quote_data.copy()
        probit_input['final_trade_price'] = current_price_proxy
        probit_input['logprice'] = np.log1p(current_price_proxy)
        probit_input['logproduct'] = probit_input['logprice'] * probit_input['log_QUOTE_QTY']

        imr_value = calculate_imr(probit_input, probit_model, probit_cols)
        full_quote_data['inverse_mills_ratio'] = imr_value

        # Second stage with the refreshed IMR; categoricals typed for XGBoost.
        prediction_df_xgb_final = pd.DataFrame([full_quote_data])
        for col in ['FF_REGION', 'product_id']:
            if col in prediction_df_xgb_final.columns:
                prediction_df_xgb_final[col] = prediction_df_xgb_final[col].astype('category')

        predicted_log_price = xgb_model.predict(prediction_df_xgb_final[xgb_model.feature_names_in_])
        final_predicted_price = np.expm1(predicted_log_price)[0]

        # Feed the new prediction back in as the next pass's proxy.
        current_price_proxy = final_predicted_price

    return final_predicted_price
|
|
# Pre-computed predicted price curves per (product_id, region, qty).
final_predictions_df = pd.read_parquet("FinalPredictions.parquet")


# Rank products by number of won quotes for the dropdown ordering.
product_counts = df[df['is_win'] == 1]['product_id'].value_counts()
ALL_PRODUCT_IDS = product_counts.index.tolist()
# Curated high-priority products pinned to the top of the dropdown.
tops = [14028934426778, 14029403716778, 14029037911778, 14031459622778, 14029657192778, 14029408741778, 14029714238363, 14028933256778, 14029726163363, 14029189621778, 14029135081778, 14029672028363, 14030192348363, 14029043536778, 14031459623363, 14030358713363, 14029650878363, 14029585530188, 14029617390188, 14029711148363, 14030989013363, 14029499296778, 14029435987778, 14030358728363, 14029286461778, 14029137196778, 14029161241778, 14029769813363, 14029539031733, 14029711147778, 14030487623363, 14029650877778, 14030358788363, 14029408732778, 14029405456733, 14029803323363, 14030675078363, 14030675183363, 14029405471733, 14029870838363, 14029585455623, 14029724033363, 14030280698363, 14029485331778, 14029727738363, 14029137241778, 14029829303363, 14029460477213, 14031251048363, 14029287541778]
# NOTE(review): this assumes `tops` coincides with the first 50 ranked ids;
# otherwise products may be dropped or duplicated -- verify against product_counts.
ALL_PRODUCT_IDS = tops + ALL_PRODUCT_IDS[50:]
ALL_REGIONS = sorted(df['FF_REGION'].unique())
|
|
# Dash application shell; stylesheet is the classic Chris P. base CSS.
app = dash.Dash(__name__, external_stylesheets=['https://codepen.io/chriddyp/pen/bWLwgP.css'])
app.title = "Predictive Pricing Dashboard"


# Build dropdown options, marking products that have a reference curve "(*)".
dropdown_options = []
for pid in ALL_PRODUCT_IDS:
    has_ref_curve = not pricing_df[pricing_df['product-anonymized'] == int(pid)].empty
    label = f"{pid}{' (*)' if has_ref_curve else ''}"
    dropdown_options.append({'label': label, 'value': pid})
|
|
# Page layout: header, then a two-column body (controls left, figure right).
app.layout = html.Div(
    style={'padding': '20px', 'fontFamily': 'Arial, sans-serif'},
    children=[

        html.H1(
            "Predictive Pricing Dashboard",
            style={'textAlign': 'center', 'marginLeft': '-10%'}
        ),
        html.P(
            "Select a product to see its predicted price curves across different regions.",
            style={'textAlign': 'center', 'marginTop': '0', 'color': 'gray', 'marginLeft': '-10%'}
        ),


        # Two-column flex body.
        html.Div(
            style={'display': 'flex', 'gap': '20px', 'padding': '20px'},
            children=[


                # Left column (30%): controls plus the historical-analysis table.
                html.Div(
                    style={'width': '30%', 'backgroundColor': '#f9f9f9', 'padding': '20px', 'borderRadius': '8px'},
                    children=[
                        html.H4("Controls", style={'textAlign': 'center', 'marginBottom': '15px'}),


                        html.Label("Select Product ID:", style={'fontWeight': 'bold'}),


                        # NOTE(review): defaults to the *second* product in the
                        # list (index 1, not 0) -- confirm this is intentional.
                        dcc.Dropdown(
                            id='product-id-dropdown',
                            options=dropdown_options,
                            value=ALL_PRODUCT_IDS[1],
                            style={'marginBottom': '20px'}
                        ),


                        # X-axis scale toggle for the price-curve figure.
                        html.Label("Quantity Scale for Plot:", style={'fontWeight': 'bold'}),
                        dcc.RadioItems(
                            id='scale-radio',
                            options=[
                                {'label': 'Linear', 'value': 'linear'},
                                {'label': 'Log', 'value': 'log'}
                            ],
                            value='linear',
                            labelStyle={'display': 'block'},
                            style={'marginBottom': '20px'}
                        ),


                        # Optional overlay of the vendor reference price curve.
                        dcc.Checklist(
                            id='show-pricing-df-checklist',
                            options=[{'label': 'Show Reference Price Curve', 'value': 'show_df'}],
                            value=[],
                            style={'marginBottom': '30px'}
                        ),


                        # Historical price analysis: qty-range inputs + table
                        # populated by the update_dashboard callback.
                        html.Div(children=[
                            html.H4("Historical Price Analysis", style={'textAlign': 'center', 'marginBottom': '15px'}),


                            html.Div(
                                style={'display': 'flex', 'gap': '10px', 'justifyContent': 'center', 'marginBottom': '15px'},
                                children=[
                                    html.Label("Min Qty:"),
                                    dcc.Input(id='min-qty-input', type='number', value=50000, style={'width': '100px'}),
                                    html.Label("Max Qty:"),
                                    dcc.Input(id='max-qty-input', type='number', value=100000, style={'width': '100px'})
                                ]
                            ),


                            html.Table(
                                id='price-analysis-table',
                                className='table',
                                style={'margin': 'auto'}
                            )
                        ])
                    ]
                ),


                # Right column (70%): fixed-size price-curve figure.
                html.Div(
                    style={'width': '70%', 'textAlign': 'center'},
                    children=[
                        html.Div(
                            style={'width': '800px', 'height': '600px', 'margin': 'auto'},
                            children=[
                                dcc.Graph(id='price-curve-graph', style={'height': '100%', 'width': '100%'})
                            ]
                        )
                    ]
                )
            ]
        )
    ]
)
|
|
@app.callback(
    [Output('price-curve-graph', 'figure'),
     Output('price-analysis-table', 'children'),],
    [Input('product-id-dropdown', 'value'),
     Input('scale-radio', 'value'),
     Input('show-pricing-df-checklist', 'value'),
     Input('min-qty-input', 'value'),
     Input('max-qty-input', 'value')]
)
def update_dashboard(selected_product_id, scale_type, show_df_value, min_qty, max_qty):
    """Redraw the price-corridor figure and the historical-analysis table.

    Parameters (Dash inputs)
    ------------------------
    selected_product_id : product chosen in the dropdown.
    scale_type : 'linear' or 'log' x-axis for quantity.
    show_df_value : checklist values; contains 'show_df' when the reference
        curve overlay is enabled.
    min_qty, max_qty : quantity bounds for the history table; Dash delivers
        None when the user clears a numeric input.

    Returns
    -------
    tuple
        (plotly figure, list of html.Thead/html.Tbody children).
    """
    # Cleared numeric inputs arrive as None; treat them as unbounded instead
    # of letting the QUOTE_QTY comparisons below raise TypeError.
    if min_qty is None:
        min_qty = -np.inf
    if max_qty is None:
        max_qty = np.inf

    fig = go.Figure()
    product_curve_data = final_predictions_df[final_predictions_df['product_id'] == selected_product_id]

    if not product_curve_data.empty:
        # One column per region, indexed by quantity; drop qtys missing a region.
        pivoted_data = product_curve_data.pivot(index='qty', columns='region', values='predicted_price').dropna()

        # Enforce non-increasing curves: unit price should not rise with
        # quantity, so clamp each region's curve by its running minimum.
        monotonic_pivoted_data = pivoted_data.cummin()

        # Cross-region corridor statistics at each quantity break.
        p75 = monotonic_pivoted_data.quantile(0.75, axis=1)
        p25 = monotonic_pivoted_data.quantile(0.25, axis=1)
        global_curve_median = monotonic_pivoted_data.median(axis=1)

        # Shaded interquartile band: lower bound first so 'tonexty' fills up to p75.
        fig.add_trace(go.Scatter(
            x=p25.index, y=p25,
            fill=None, mode='lines', line_color='rgba(100, 100, 100, 0.3)', showlegend=False
        ))
        fig.add_trace(go.Scatter(
            x=p75.index, y=p75,
            fill='tonexty',
            mode='lines', line_color='rgba(100, 100, 100, 0.3)',
            name='Interquartile Price Range (25-75%)'
        ))

        fig.add_trace(go.Scatter(
            x=global_curve_median.index, y=global_curve_median,
            mode='lines', name='Global Median Price Curve',
            line=dict(color='black', dash='dash', width=2)
        ))

    # Optional vendor reference-curve overlay.
    if 'show_df' in show_df_value:
        ref_curve_data = pricing_df[pricing_df['product-anonymized'] == int(selected_product_id)]
        if not ref_curve_data.empty:
            # Transpose the quantity-break columns into a qty-indexed series.
            ref_curve_numeric = ref_curve_data.drop(columns=['product-anonymized', 'DBC'], errors='ignore').T
            ref_curve_numeric.columns = ['Reference Curve']
            ref_curve_numeric.index = pd.to_numeric(ref_curve_numeric.index, errors="coerce")
            ref_curve_numeric = ref_curve_numeric.dropna().sort_index()

            fig.add_trace(go.Scatter(
                x=ref_curve_numeric.index,
                y=ref_curve_numeric['Reference Curve'],
                mode='lines+markers',
                name='Reference Price Curve',
                line=dict(color='blue', width=2)
            ))

    fig.update_layout(
        title=f"Target Pricing Corridor for Product: {selected_product_id}",
        xaxis_title="Quote Quantity",
        yaxis_title="Predicted Final Trade Price (USD)",
        legend_title="Price Range",
        template="plotly_white",
        xaxis_type=scale_type.lower()
    )

    # ---- Historical price-analysis table --------------------------------
    now = datetime.now()
    lookback_periods = {
        "Last 6 Months": now - relativedelta(months=6),
        "Last 1 Year": now - relativedelta(years=1),
        "Last 2 Years": now - relativedelta(years=2),
        "All Time": df_accepted['QUOTE_ITEM_CREATE_DATE'].min()
    }

    table_data = []
    for period_name, start_date in lookback_periods.items():
        # NOTE(review): product_id is compared as str here but as a raw value
        # against final_predictions_df above -- confirm dtypes are consistent.
        hist_filtered = df_accepted[
            (df_accepted['product_id'] == str(selected_product_id)) &
            (df_accepted['QUOTE_QTY'] >= min_qty) &
            (df_accepted['QUOTE_QTY'] <= max_qty) &
            (df_accepted['QUOTE_ITEM_CREATE_DATE'] >= start_date)
        ]

        if not hist_filtered.empty:
            avg_actual_price = hist_filtered['final_trade_price'].mean()
            avg_predicted_price = hist_filtered['predicted_price'].mean()
            num_trades = len(hist_filtered)

            table_data.append({
                "Lookback Period": period_name,
                "Avg. Actual Price": f"${avg_actual_price:.2f}",
                "Avg. Predicted Price": f"${avg_predicted_price:.2f}",
                "Number of Trades": num_trades
            })

    # Dict insertion order matches the header order, so cells line up.
    table_header = [html.Thead(html.Tr([html.Th(col) for col in ["Lookback Period", "Avg. Actual Price", "Avg. Predicted Price", "Number of Trades"]]))]
    table_body = [html.Tbody([html.Tr([html.Td(row[col]) for col in row]) for row in table_data])]

    return fig, table_header + table_body
| |
|
|
# Expose the underlying Flask server for WSGI deployment (e.g. gunicorn app:server).
server = app.server
if __name__ == '__main__':
    # Bind 0.0.0.0 so the dashboard is reachable from outside a container; 7860
    # is the conventional Hugging Face Spaces port.
    app.run(host='0.0.0.0', port=7860)
|
|