# app.py — Tokopedia price predictor (Streamlit)
# Uploaded with huggingface_hub, revision 5ae8501 (verified)
import streamlit as st
import pandas as pd
import numpy as np
import joblib
import plotly.express as px
import re
import os
from datetime import datetime
from nltk.corpus import stopwords
import nltk
# Page setup must be the first Streamlit call executed in the script.
st.set_page_config(page_title="Tokopedia Price Predictor", layout="wide")
# Directory holding the serialized vectorizer, preprocessor, column list,
# trained models and feature-importance CSVs.
ARTIFACTS_DIR = 'artifacts'
@st.cache_resource
def load_resources():
    """Load stopwords, the TF-IDF vectorizer, preprocessor, model column
    list and the three price models from ARTIFACTS_DIR.

    Cached once per server process via st.cache_resource.

    Returns:
        (vectorizer, price_preprocess, price_cols, price_models,
        final_stopwords) on success, or five ``None`` values on failure.
    """
    try:
        # quiet=True keeps NLTK's download progress out of the app logs.
        nltk.download('stopwords', quiet=True)
        indo_stopwords = set(stopwords.words('indonesian'))
        # Marketplace slang / promo filler that carries no pricing signal.
        custom_stopwords = {
            'yg', 'dg', 'dgn', 'ny', 'nya', 'kalo', 'klo', 'gan', 'sis', 'kak', 'bro',
            'bosh', 'boss', 'nomor', 'wa', 'hub', 'cod', 'ready', 'stock', 'stok',
            'promo', 'murah', 'diskon', 'termurah', 'terlaris', 'bestseller'
        }
        final_stopwords = indo_stopwords.union(custom_stopwords)

        def _load(filename):
            # All artifacts live side by side inside ARTIFACTS_DIR.
            return joblib.load(os.path.join(ARTIFACTS_DIR, filename))

        vectorizer = _load('tfidf_vectorizer.pkl')
        price_preprocess = _load('price_preprocessor.pkl')
        price_cols = _load('price_columns.pkl')
        # Model files follow the '<Name_With_Underscores>_price.pkl' pattern.
        price_models = {
            name: _load(f"{name.replace(' ', '_')}_price.pkl")
            for name in ("Random Forest", "XGBoost", "MLP")
        }
        return vectorizer, price_preprocess, price_cols, price_models, final_stopwords
    except Exception as e:
        # Broad catch is deliberate: a missing/corrupt artifact should be
        # reported in the UI instead of crashing the whole app.
        st.error(f"Error loading resources: {e}")
        return None, None, None, None, None
# Globals shared by the rest of the script; all None if loading failed.
(vectorizer, price_preprocess, price_cols, price_models, final_stopwords) = load_resources()
@st.cache_data
def get_categories():
    """Return the sorted category names encoded in the model's one-hot
    ``cat_*`` columns, falling back to ``['Other']`` when unavailable.

    Fixes two defects in the original:
    * an artifact with zero ``cat_`` columns used to clobber the
      ``['Other']`` fallback with an empty list (empty selectbox);
    * ``str.replace('cat_', '')`` stripped the token anywhere in the
      name, not just as a prefix.
    """
    if not price_cols:
        return ['Other']
    prefix = 'cat_'
    categories = [c[len(prefix):] for c in price_cols if c.startswith(prefix)]
    # Guard: never return an empty list to the category selectbox.
    return sorted(categories) if categories else ['Other']
def clean_text(text):
    """Lowercase *text*, strip non-alphanumeric characters, collapse
    whitespace and drop tokens found in the global stopword set.

    Non-string input (e.g. None/NaN) yields an empty string.
    """
    if not isinstance(text, str):
        return ""
    lowered = text.lower()
    # Replace anything that is not a lowercase letter, digit or space.
    alnum_only = re.sub(r'[^a-z0-9\s]', ' ', lowered)
    normalized = re.sub(r'\s+', ' ', alnum_only).strip()
    kept_tokens = [tok for tok in normalized.split() if tok not in final_stopwords]
    return " ".join(kept_tokens)
def get_date_features(date_obj, prefix):
    """Derive calendar features from *date_obj*, keyed '<prefix>_<name>'.

    Returns a dict with day-of-week (Monday=0), month (1-12) and a 0/1
    weekend flag (Saturday/Sunday).
    """
    dow = date_obj.weekday()
    features = {}
    features[f"{prefix}_dayofweek"] = dow
    features[f"{prefix}_month"] = date_obj.month
    # weekday() numbers Monday as 0, so 5 and 6 are the weekend.
    features[f"{prefix}_is_weekend"] = int(dow >= 5)
    return features
def prepare_raw_dataframe(user_input, target_columns, vectorizer):
    """Build a single-row feature DataFrame aligned to *target_columns*.

    Fills, in order: TF-IDF text features, the one-hot category flag,
    then scalar features copied straight from *user_input*.  Keys in
    *user_input* with no matching model column are silently ignored.
    """
    # Start from one row of zeros over every column the model expects.
    input_df = pd.DataFrame(0, index=[0], columns=target_columns)
    # Product name and description share one TF-IDF vocabulary.
    combined_text = clean_text(user_input.get('product_name', '')) + " " + clean_text(user_input.get('description', ''))
    tfidf_matrix = vectorizer.transform([combined_text])
    # Text columns in the model are prefixed 'word_'; align by name.
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=[f"word_{w}" for w in vectorizer.get_feature_names_out()])
    common_text_cols = input_df.columns.intersection(tfidf_df.columns)
    input_df[common_text_cols] = tfidf_df[common_text_cols]
    # One-hot category flag; unseen categories fall back to 'cat_Other'.
    cat_col = f"cat_{user_input.get('category', 'Other')}"
    if cat_col in input_df.columns:
        input_df[cat_col] = 1
    elif 'cat_Other' in input_df.columns:
        input_df['cat_Other'] = 1
    # Copy scalar features through wherever the key matches a column.
    for key, value in user_input.items():
        if key in input_df.columns:
            input_df[key] = value
    return input_df
def load_and_plot_importance(model_name, target_name):
    """Render the saved feature-importance bar chart for a given model
    and target; silently renders nothing when the CSV artifact is absent.
    """
    csv_name = f"importance_data_{target_name}_{model_name.replace(' ', '_')}.csv"
    csv_path = os.path.join(ARTIFACTS_DIR, csv_name)
    if not os.path.exists(csv_path):
        return
    importance_df = pd.read_csv(csv_path)
    top_features = importance_df.head(15)
    fig = px.bar(
        top_features, x='Importance', y='Feature', orientation='h', error_x='Std',
        title=f"Feature Importance ({model_name})", color='Importance'
    )
    # Put the highest-importance feature at the top of the chart.
    fig.update_layout(yaxis=dict(autorange="reversed"))
    st.plotly_chart(fig)
st.title("🛍️ Advanced E-Commerce Price Predictor")
# --- Main product inputs: name/description on the left, category/tier/condition on the right.
with st.container():
    c1, c2 = st.columns([2, 1])
    with c1:
        p_name = st.text_input("Product Name", "Jam tangan Automatic SF PS3/03 CCO Orange Rubber NFC Aktif fullset")
        p_desc = st.text_area("Description", 'SF PS3/03 aka "CCO" Kaca : Crystal Diameter : 47mm Weight : 138gr Case : Stainless Bezel : Stainless Strap : Rubber Buckle : Tang Movement : Automatic Japan Miyota 82A7 Quality : Clone 1:1 Function : Hour, Minute, Second, 24hour Indicator, Automatic. Include box fullset & extra rubber strap. Note : Untuk mencegah hal hal yg tidak diinginkan, harap membuat VIDEO UNBOXING sebelum membuka paket utk membantu jika diperlukannya klaim kerusakan atau kekurangan barang. Terima kasih')
    with c2:
        categories_list = get_categories()
        # BUG FIX: a bare .index('Fashion Pria') raises ValueError and crashes
        # the app whenever the artifacts fail to load (categories == ['Other'])
        # or the category set changes; fall back to the first entry instead.
        default_idx = categories_list.index('Fashion Pria') if 'Fashion Pria' in categories_list else 0
        category = st.selectbox("Category", categories_list, index=default_idx)
        shop_tier = st.selectbox(
            "Shop Tier",
            ["Regular Merchant", "Power Merchant", "Official Store"]
        )
        condition = st.radio("Condition", ["New", "Used"], horizontal=True)
c_left, c_right = st.columns(2)
# --- Left column: physical/stock/shipping, shop performance, shop ratings.
with c_left:
    with st.expander("📦 Physical, Stock & Shipping", expanded=True):
        col1, col2, col3 = st.columns(3)
        weight = col1.number_input("Weight (g)", value=1000)
        stock = col2.number_input("Stock", value=5)
        sold = col3.number_input("Sold", value=0)
        min_order = col1.number_input("Min Order", value=1)
        max_order = col2.number_input("Max Order", value=5)
        st.caption("Shipping Options")
        sc1, sc2, sc3 = st.columns(3)
        ship_inst = sc1.checkbox("Instant", False)
        ship_same = sc2.checkbox("Same Day", False)
        ship_next = sc3.checkbox("Next Day", False)
        ship_reg = sc1.checkbox("Regular", True)
        ship_cargo = sc2.checkbox("Cargo", False)
        ship_eco = sc3.checkbox("Economy", False)
        total_shipping = st.number_input("Total Shipping Types", value=2)
        is_preorder = st.checkbox("Preorder?", False)
        is_discount = st.checkbox("Discounted?", False)
    with st.expander("🏪 Shop Performance", expanded=False):
        shop_age = st.number_input("Shop Age (Days)", 365)
        shop_pop = st.number_input("Shop Popularity Score", 100)
        shop_city_pop = st.number_input("City Popularity", 1)
        resp_time = st.number_input("Response Time (mins)", 420)
        # BUG FIX: 1662746294 is a Unix epoch in *seconds* (2022-09-09).
        # Without unit='s', pandas interprets the integer as nanoseconds
        # and the default date silently becomes 1970-01-01.
        date_shop_open = st.date_input("Shop Open Since", pd.to_datetime(1662746294, unit='s'))
    with st.expander("⚙️ Advanced Shop Setup (Detailed Ratings)", expanded=False):
        st.caption("Input exact review counts for the entire shop")
        ac1, ac2, ac3, ac4, ac5 = st.columns(5)
        shop_r5 = ac1.number_input("Shop 5★", value=31)
        shop_r4 = ac2.number_input("Shop 4★", value=3)
        shop_r3 = ac3.number_input("Shop 3★", value=0)
        shop_r2 = ac4.number_input("Shop 2★", value=0)
        shop_r1 = ac5.number_input("Shop 1★", value=0)
        shop_total_rating = shop_r5 + shop_r4 + shop_r3 + shop_r2 + shop_r1
        weighted_shop_sum = (shop_r5*5 + shop_r4*4 + shop_r3*3 + shop_r2*2 + shop_r1*1)
        # Weighted average star score; guard against a brand-new shop (0 ratings).
        shop_rating_score = weighted_shop_sum / shop_total_rating if shop_total_rating > 0 else 0.0
        st.text(f"Calculated Total Ratings: {shop_total_rating}")
        st.text(f"Calculated Avg Score: {shop_rating_score:.2f}")
# --- Right column: product-level ratings/reviews, review topics, media & dates.
with c_right:
    with st.expander("⭐ Product Ratings & Reviews", expanded=True):
        # NOTE(review): c1..c5 shadow the layout columns created earlier in
        # the script; harmless since the originals are no longer referenced.
        c1, c2, c3, c4, c5 = st.columns(5)
        r5 = c1.number_input("Prod 5★", value=0)
        r4 = c2.number_input("Prod 4★", value=0)
        r3 = c3.number_input("Prod 3★", value=0)
        r2 = c4.number_input("Prod 2★", value=0)
        r1 = c5.number_input("Prod 1★", value=0)
        total_rating_count = r5 + r4 + r3 + r2 + r1
        weighted_sum = (r5*5 + r4*4 + r3*3 + r2*2 + r1*1)
        # Weighted average star rating; guard against division by zero.
        avg_rating = weighted_sum / total_rating_count if total_rating_count > 0 else 0.0
        st.info(f"Avg Rating: {avg_rating:.2f} ({total_rating_count} ratings)")
        total_reviews = st.number_input("Total Written Reviews", value=0)
        reviews_w_img = st.number_input("Reviews w/ Images", value=0)
        satisfaction = st.slider("Buyer Satisfaction %", 0, 100, 0)
    with st.expander("💬 Count Review Topics", expanded=False):
        # Counts of reviews mentioning each topic (quality, service, packaging...).
        tc1, tc2 = st.columns(2)
        rev_qual = tc1.number_input("Quality", 0)
        rev_srv = tc2.number_input("Service", 0)
        rev_pack = tc1.number_input("Packaging", 0)
        rev_price = tc2.number_input("Price", 0)
        rev_desc = tc1.number_input("Description", 0)
        rev_ship = tc2.number_input("Shipping", 0)
    with st.expander("📸 Media & Listing Date", expanded=False):
        mc1, mc2 = st.columns(2)
        vid_count = mc1.number_input("Videos", 1)
        img_count = mc2.number_input("Images", 4)
        date_listing = st.date_input("Listing Created", pd.to_datetime('2024-09-10T00:29:59+07:00'))
# Calendar features derived from the listing and shop-open dates.
feat_created = get_date_features(date_listing, "created_at")
feat_shop = get_date_features(date_shop_open, "shop_open_since")
# Listing age in whole days relative to "now" (server local date).
listing_age_calc = (datetime.now().date() - date_listing).days
# Flat feature dict; keys must match the model's training columns —
# prepare_raw_dataframe silently drops anything the model doesn't know.
base_input = {
    'product_name': p_name, 'description': p_desc, 'category': category,
    'condition_encoded': 1 if condition == 'New' else 0,
    'weight_grams': weight, 'min_order': min_order, 'max_order': max_order,
    'is_preorder': 1 if is_preorder else 0,
    'is_discount': 1 if is_discount else 0,
    # Shop tier is one-hot encoded across three mutually exclusive flags.
    'is_regular_merchant': 1 if shop_tier == "Regular Merchant" else 0,
    'is_power_merchant': 1 if shop_tier == "Power Merchant" else 0,
    'is_official_store': 1 if shop_tier == "Official Store" else 0,
    'video_count': vid_count,
    'image_count': img_count,
    'can_shipping_instant': 1 if ship_inst else 0,
    'can_shipping_sameday': 1 if ship_same else 0,
    'can_shipping_next_day': 1 if ship_next else 0,
    'can_shipping_regular': 1 if ship_reg else 0,
    'can_shipping_cargo': 1 if ship_cargo else 0,
    'can_shipping_economy': 1 if ship_eco else 0,
    'total_shipping_types': total_shipping,
    'rating': avg_rating,
    'total_product_rating': total_rating_count,
    'total_product_review': total_reviews,
    'total_product_review_with_image': reviews_w_img,
    'buyer_satisfaction_percentage': satisfaction,
    'product_rating_5_star_count': r5,
    'product_rating_4_star_count': r4,
    'product_rating_3_star_count': r3,
    'product_rating_2_star_count': r2,
    'product_rating_1_star_count': r1,
    # Indonesian column names come straight from the training data
    # (kualitas=quality, pelayanan=service, kemasan=packaging, harga=price,
    # sesuai deskripsi=matches description, pengiriman=shipping).
    'total_review_about_kualitas': rev_qual,
    'total_review_about_pelayanan': rev_srv,
    'total_review_about_kemasan': rev_pack,
    'total_review_about_harga': rev_price,
    'total_review_about_sesuai deskripsi': rev_desc,
    'total_review_about_pengiriman': rev_ship,
    'shop_rating_score': shop_rating_score,
    'shop_total_rating': shop_total_rating,
    'shop_rating_5': shop_r5,
    'shop_rating_4': shop_r4,
    'shop_rating_3': shop_r3,
    'shop_rating_2': shop_r2,
    'shop_rating_1': shop_r1,
    'shop_response_time_mins': resp_time,
    'shop_age_days': shop_age,
    'listing_age_days': listing_age_calc,
    'shop_popularity': shop_pop,
    'shop_city_popularity': shop_city_pop,
    **feat_created, **feat_shop
}
# Models were trained on log1p-transformed stock/sold counts.
base_input['Log_stock'] = np.log1p(stock)
base_input['Log_sold'] = np.log1p(sold)
st.divider()
st.subheader("💰 Price Prediction")
# --- Prediction: pick a model, build the feature row, invert the log target.
if price_models:
    model_name = st.selectbox("Select Model", list(price_models.keys()), label_visibility="collapsed")
    predict_btn = st.button("Predict Price", type="primary", use_container_width=True)
    if predict_btn:
        raw_df = prepare_raw_dataframe(base_input, price_cols, vectorizer)
        # Align to the exact training column order; any column the model
        # expects but the frame lacks is zero-filled.  One vectorized
        # reindex replaces the manual missing-column loop + reordering.
        raw_df = raw_df.reindex(columns=price_cols, fill_value=0)
        X_scaled = price_preprocess.transform(raw_df)
        # Models predict log1p(price); invert with expm1 to get rupiah.
        log_pred = price_models[model_name].predict(X_scaled)[0]
        price_pred = np.expm1(log_pred)
        st.success(f"Estimated Price: **Rp {price_pred:,.0f}**")
        with st.expander("Feature Importance Analysis", expanded=True):
            load_and_plot_importance(model_name, "price")
else:
    st.error("Models failed to load.")