# app.py — Tokopedia price predictor (Streamlit)
# Uploaded with huggingface_hub, revision 5ae8501 (verified)
import streamlit as st
import pandas as pd
import numpy as np
import joblib
import plotly.express as px
import re
import os
from datetime import datetime
from nltk.corpus import stopwords
import nltk
# Page setup must be the first Streamlit call executed in the script.
st.set_page_config(page_title="Tokopedia Price Predictor", layout="wide")
# Directory holding the serialized vectorizer, preprocessor, column list,
# trained models and feature-importance CSVs.
ARTIFACTS_DIR = 'artifacts'
@st.cache_resource
def load_resources():
    """Load stopwords, the TF-IDF vectorizer, preprocessor, model column
    list and the three price models from ARTIFACTS_DIR.

    Cached once per server process via st.cache_resource.

    Returns:
        (vectorizer, price_preprocess, price_cols, price_models,
        final_stopwords) on success, or five ``None`` values on failure.
    """
    try:
        # quiet=True keeps NLTK's download progress out of the app logs.
        nltk.download('stopwords', quiet=True)
        indo_stopwords = set(stopwords.words('indonesian'))
        # Marketplace slang / promo filler that carries no pricing signal.
        custom_stopwords = {
            'yg', 'dg', 'dgn', 'ny', 'nya', 'kalo', 'klo', 'gan', 'sis', 'kak', 'bro',
            'bosh', 'boss', 'nomor', 'wa', 'hub', 'cod', 'ready', 'stock', 'stok',
            'promo', 'murah', 'diskon', 'termurah', 'terlaris', 'bestseller'
        }
        final_stopwords = indo_stopwords.union(custom_stopwords)

        def _load(filename):
            # All artifacts live side by side inside ARTIFACTS_DIR.
            return joblib.load(os.path.join(ARTIFACTS_DIR, filename))

        vectorizer = _load('tfidf_vectorizer.pkl')
        price_preprocess = _load('price_preprocessor.pkl')
        price_cols = _load('price_columns.pkl')
        # Model files follow the '<Name_With_Underscores>_price.pkl' pattern.
        price_models = {
            name: _load(f"{name.replace(' ', '_')}_price.pkl")
            for name in ("Random Forest", "XGBoost", "MLP")
        }
        return vectorizer, price_preprocess, price_cols, price_models, final_stopwords
    except Exception as e:
        # Broad catch is deliberate: a missing/corrupt artifact should be
        # reported in the UI instead of crashing the whole app.
        st.error(f"Error loading resources: {e}")
        return None, None, None, None, None
# Globals shared by the rest of the script; all None if loading failed.
(vectorizer, price_preprocess, price_cols, price_models, final_stopwords) = load_resources()
@st.cache_data
def get_categories():
    """Return the sorted category names encoded in the model's one-hot
    ``cat_*`` columns, falling back to ``['Other']`` when unavailable.

    Fixes two defects in the original:
    * an artifact with zero ``cat_`` columns used to clobber the
      ``['Other']`` fallback with an empty list (empty selectbox);
    * ``str.replace('cat_', '')`` stripped the token anywhere in the
      name, not just as a prefix.
    """
    if not price_cols:
        return ['Other']
    prefix = 'cat_'
    categories = [c[len(prefix):] for c in price_cols if c.startswith(prefix)]
    # Guard: never return an empty list to the category selectbox.
    return sorted(categories) if categories else ['Other']
def clean_text(text):
    """Lowercase *text*, strip non-alphanumeric characters, collapse
    whitespace and drop tokens found in the global stopword set.

    Non-string input (e.g. None/NaN) yields an empty string.
    """
    if not isinstance(text, str):
        return ""
    lowered = text.lower()
    # Replace anything that is not a lowercase letter, digit or space.
    alnum_only = re.sub(r'[^a-z0-9\s]', ' ', lowered)
    normalized = re.sub(r'\s+', ' ', alnum_only).strip()
    kept_tokens = [tok for tok in normalized.split() if tok not in final_stopwords]
    return " ".join(kept_tokens)
def get_date_features(date_obj, prefix):
    """Derive calendar features from *date_obj*, keyed '<prefix>_<name>'.

    Returns a dict with day-of-week (Monday=0), month (1-12) and a 0/1
    weekend flag (Saturday/Sunday).
    """
    dow = date_obj.weekday()
    features = {}
    features[f"{prefix}_dayofweek"] = dow
    features[f"{prefix}_month"] = date_obj.month
    # weekday() numbers Monday as 0, so 5 and 6 are the weekend.
    features[f"{prefix}_is_weekend"] = int(dow >= 5)
    return features
def prepare_raw_dataframe(user_input, target_columns, vectorizer):
    """Build a single-row feature DataFrame aligned to *target_columns*.

    Fills, in order: TF-IDF text features, the one-hot category flag,
    then scalar features copied straight from *user_input*.  Keys in
    *user_input* with no matching model column are silently ignored.
    """
    # Start from one row of zeros over every column the model expects.
    input_df = pd.DataFrame(0, index=[0], columns=target_columns)
    # Product name and description share one TF-IDF vocabulary.
    combined_text = clean_text(user_input.get('product_name', '')) + " " + clean_text(user_input.get('description', ''))
    tfidf_matrix = vectorizer.transform([combined_text])
    # Text columns in the model are prefixed 'word_'; align by name.
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=[f"word_{w}" for w in vectorizer.get_feature_names_out()])
    common_text_cols = input_df.columns.intersection(tfidf_df.columns)
    input_df[common_text_cols] = tfidf_df[common_text_cols]
    # One-hot category flag; unseen categories fall back to 'cat_Other'.
    cat_col = f"cat_{user_input.get('category', 'Other')}"
    if cat_col in input_df.columns:
        input_df[cat_col] = 1
    elif 'cat_Other' in input_df.columns:
        input_df['cat_Other'] = 1
    # Copy scalar features through wherever the key matches a column.
    for key, value in user_input.items():
        if key in input_df.columns:
            input_df[key] = value
    return input_df
def load_and_plot_importance(model_name, target_name):
    """Render the saved feature-importance bar chart for a given model
    and target; silently renders nothing when the CSV artifact is absent.
    """
    csv_name = f"importance_data_{target_name}_{model_name.replace(' ', '_')}.csv"
    csv_path = os.path.join(ARTIFACTS_DIR, csv_name)
    if not os.path.exists(csv_path):
        return
    importance_df = pd.read_csv(csv_path)
    top_features = importance_df.head(15)
    fig = px.bar(
        top_features, x='Importance', y='Feature', orientation='h', error_x='Std',
        title=f"Feature Importance ({model_name})", color='Importance'
    )
    # Put the highest-importance feature at the top of the chart.
    fig.update_layout(yaxis=dict(autorange="reversed"))
    st.plotly_chart(fig)
st.title("🛍️ Advanced E-Commerce Price Predictor")
# --- Main product inputs: name/description on the left, category/tier/condition on the right.
with st.container():
    c1, c2 = st.columns([2, 1])
    with c1:
        p_name = st.text_input("Product Name", "Jam tangan Automatic SF PS3/03 CCO Orange Rubber NFC Aktif fullset")
        p_desc = st.text_area("Description", 'SF PS3/03 aka "CCO" Kaca : Crystal Diameter : 47mm Weight : 138gr Case : Stainless Bezel : Stainless Strap : Rubber Buckle : Tang Movement : Automatic Japan Miyota 82A7 Quality : Clone 1:1 Function : Hour, Minute, Second, 24hour Indicator, Automatic. Include box fullset & extra rubber strap. Note : Untuk mencegah hal hal yg tidak diinginkan, harap membuat VIDEO UNBOXING sebelum membuka paket utk membantu jika diperlukannya klaim kerusakan atau kekurangan barang. Terima kasih')
    with c2:
        categories_list = get_categories()
        # BUG FIX: a bare .index('Fashion Pria') raises ValueError and crashes
        # the app whenever the artifacts fail to load (categories == ['Other'])
        # or the category set changes; fall back to the first entry instead.
        default_idx = categories_list.index('Fashion Pria') if 'Fashion Pria' in categories_list else 0
        category = st.selectbox("Category", categories_list, index=default_idx)
        shop_tier = st.selectbox(
            "Shop Tier",
            ["Regular Merchant", "Power Merchant", "Official Store"]
        )
        condition = st.radio("Condition", ["New", "Used"], horizontal=True)
c_left, c_right = st.columns(2)
# --- Left column: physical/stock/shipping, shop performance, shop ratings.
with c_left:
    with st.expander("📦 Physical, Stock & Shipping", expanded=True):
        col1, col2, col3 = st.columns(3)
        weight = col1.number_input("Weight (g)", value=1000)
        stock = col2.number_input("Stock", value=5)
        sold = col3.number_input("Sold", value=0)
        min_order = col1.number_input("Min Order", value=1)
        max_order = col2.number_input("Max Order", value=5)
        st.caption("Shipping Options")
        sc1, sc2, sc3 = st.columns(3)
        ship_inst = sc1.checkbox("Instant", False)
        ship_same = sc2.checkbox("Same Day", False)
        ship_next = sc3.checkbox("Next Day", False)
        ship_reg = sc1.checkbox("Regular", True)
        ship_cargo = sc2.checkbox("Cargo", False)
        ship_eco = sc3.checkbox("Economy", False)
        total_shipping = st.number_input("Total Shipping Types", value=2)
        is_preorder = st.checkbox("Preorder?", False)
        is_discount = st.checkbox("Discounted?", False)
    with st.expander("🏪 Shop Performance", expanded=False):
        shop_age = st.number_input("Shop Age (Days)", 365)
        shop_pop = st.number_input("Shop Popularity Score", 100)
        shop_city_pop = st.number_input("City Popularity", 1)
        resp_time = st.number_input("Response Time (mins)", 420)
        # BUG FIX: 1662746294 is a Unix epoch in *seconds* (2022-09-09).
        # Without unit='s', pandas interprets the integer as nanoseconds
        # and the default date silently becomes 1970-01-01.
        date_shop_open = st.date_input("Shop Open Since", pd.to_datetime(1662746294, unit='s'))
    with st.expander("⚙️ Advanced Shop Setup (Detailed Ratings)", expanded=False):
        st.caption("Input exact review counts for the entire shop")
        ac1, ac2, ac3, ac4, ac5 = st.columns(5)
        shop_r5 = ac1.number_input("Shop 5★", value=31)
        shop_r4 = ac2.number_input("Shop 4★", value=3)
        shop_r3 = ac3.number_input("Shop 3★", value=0)
        shop_r2 = ac4.number_input("Shop 2★", value=0)
        shop_r1 = ac5.number_input("Shop 1★", value=0)
        shop_total_rating = shop_r5 + shop_r4 + shop_r3 + shop_r2 + shop_r1
        weighted_shop_sum = (shop_r5*5 + shop_r4*4 + shop_r3*3 + shop_r2*2 + shop_r1*1)
        # Weighted average star score; guard against a brand-new shop (0 ratings).
        shop_rating_score = weighted_shop_sum / shop_total_rating if shop_total_rating > 0 else 0.0
        st.text(f"Calculated Total Ratings: {shop_total_rating}")
        st.text(f"Calculated Avg Score: {shop_rating_score:.2f}")
# --- Right column: product-level ratings/reviews, review topics, media & dates.
with c_right:
    with st.expander("⭐ Product Ratings & Reviews", expanded=True):
        # NOTE(review): c1..c5 shadow the layout columns created earlier in
        # the script; harmless since the originals are no longer referenced.
        c1, c2, c3, c4, c5 = st.columns(5)
        r5 = c1.number_input("Prod 5★", value=0)
        r4 = c2.number_input("Prod 4★", value=0)
        r3 = c3.number_input("Prod 3★", value=0)
        r2 = c4.number_input("Prod 2★", value=0)
        r1 = c5.number_input("Prod 1★", value=0)
        total_rating_count = r5 + r4 + r3 + r2 + r1
        weighted_sum = (r5*5 + r4*4 + r3*3 + r2*2 + r1*1)
        # Weighted average star rating; guard against division by zero.
        avg_rating = weighted_sum / total_rating_count if total_rating_count > 0 else 0.0
        st.info(f"Avg Rating: {avg_rating:.2f} ({total_rating_count} ratings)")
        total_reviews = st.number_input("Total Written Reviews", value=0)
        reviews_w_img = st.number_input("Reviews w/ Images", value=0)
        satisfaction = st.slider("Buyer Satisfaction %", 0, 100, 0)
    with st.expander("💬 Count Review Topics", expanded=False):
        # Counts of reviews mentioning each topic (quality, service, packaging...).
        tc1, tc2 = st.columns(2)
        rev_qual = tc1.number_input("Quality", 0)
        rev_srv = tc2.number_input("Service", 0)
        rev_pack = tc1.number_input("Packaging", 0)
        rev_price = tc2.number_input("Price", 0)
        rev_desc = tc1.number_input("Description", 0)
        rev_ship = tc2.number_input("Shipping", 0)
    with st.expander("📸 Media & Listing Date", expanded=False):
        mc1, mc2 = st.columns(2)
        vid_count = mc1.number_input("Videos", 1)
        img_count = mc2.number_input("Images", 4)
        date_listing = st.date_input("Listing Created", pd.to_datetime('2024-09-10T00:29:59+07:00'))
# Calendar features derived from the listing and shop-open dates.
feat_created = get_date_features(date_listing, "created_at")
feat_shop = get_date_features(date_shop_open, "shop_open_since")
# Listing age in whole days relative to "now" (server local date).
listing_age_calc = (datetime.now().date() - date_listing).days
# Flat feature dict; keys must match the model's training columns —
# prepare_raw_dataframe silently drops anything the model doesn't know.
base_input = {
    'product_name': p_name, 'description': p_desc, 'category': category,
    'condition_encoded': 1 if condition == 'New' else 0,
    'weight_grams': weight, 'min_order': min_order, 'max_order': max_order,
    'is_preorder': 1 if is_preorder else 0,
    'is_discount': 1 if is_discount else 0,
    # Shop tier is one-hot encoded across three mutually exclusive flags.
    'is_regular_merchant': 1 if shop_tier == "Regular Merchant" else 0,
    'is_power_merchant': 1 if shop_tier == "Power Merchant" else 0,
    'is_official_store': 1 if shop_tier == "Official Store" else 0,
    'video_count': vid_count,
    'image_count': img_count,
    'can_shipping_instant': 1 if ship_inst else 0,
    'can_shipping_sameday': 1 if ship_same else 0,
    'can_shipping_next_day': 1 if ship_next else 0,
    'can_shipping_regular': 1 if ship_reg else 0,
    'can_shipping_cargo': 1 if ship_cargo else 0,
    'can_shipping_economy': 1 if ship_eco else 0,
    'total_shipping_types': total_shipping,
    'rating': avg_rating,
    'total_product_rating': total_rating_count,
    'total_product_review': total_reviews,
    'total_product_review_with_image': reviews_w_img,
    'buyer_satisfaction_percentage': satisfaction,
    'product_rating_5_star_count': r5,
    'product_rating_4_star_count': r4,
    'product_rating_3_star_count': r3,
    'product_rating_2_star_count': r2,
    'product_rating_1_star_count': r1,
    # Indonesian column names come straight from the training data
    # (kualitas=quality, pelayanan=service, kemasan=packaging, harga=price,
    # sesuai deskripsi=matches description, pengiriman=shipping).
    'total_review_about_kualitas': rev_qual,
    'total_review_about_pelayanan': rev_srv,
    'total_review_about_kemasan': rev_pack,
    'total_review_about_harga': rev_price,
    'total_review_about_sesuai deskripsi': rev_desc,
    'total_review_about_pengiriman': rev_ship,
    'shop_rating_score': shop_rating_score,
    'shop_total_rating': shop_total_rating,
    'shop_rating_5': shop_r5,
    'shop_rating_4': shop_r4,
    'shop_rating_3': shop_r3,
    'shop_rating_2': shop_r2,
    'shop_rating_1': shop_r1,
    'shop_response_time_mins': resp_time,
    'shop_age_days': shop_age,
    'listing_age_days': listing_age_calc,
    'shop_popularity': shop_pop,
    'shop_city_popularity': shop_city_pop,
    **feat_created, **feat_shop
}
# Models were trained on log1p-transformed stock/sold counts.
base_input['Log_stock'] = np.log1p(stock)
base_input['Log_sold'] = np.log1p(sold)
st.divider()
st.subheader("💰 Price Prediction")
# --- Prediction: pick a model, build the feature row, invert the log target.
if price_models:
    model_name = st.selectbox("Select Model", list(price_models.keys()), label_visibility="collapsed")
    predict_btn = st.button("Predict Price", type="primary", use_container_width=True)
    if predict_btn:
        raw_df = prepare_raw_dataframe(base_input, price_cols, vectorizer)
        # Align to the exact training column order; any column the model
        # expects but the frame lacks is zero-filled.  One vectorized
        # reindex replaces the manual missing-column loop + reordering.
        raw_df = raw_df.reindex(columns=price_cols, fill_value=0)
        X_scaled = price_preprocess.transform(raw_df)
        # Models predict log1p(price); invert with expm1 to get rupiah.
        log_pred = price_models[model_name].predict(X_scaled)[0]
        price_pred = np.expm1(log_pred)
        st.success(f"Estimated Price: **Rp {price_pred:,.0f}**")
        with st.expander("Feature Importance Analysis", expanded=True):
            load_and_plot_importance(model_name, "price")
else:
    st.error("Models failed to load.")