Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| import joblib | |
| import plotly.express as px | |
| import re | |
| import os | |
| from datetime import datetime | |
| from nltk.corpus import stopwords | |
| import nltk | |
# Page-level configuration; this is the first Streamlit call in the script.
st.set_page_config(
    page_title="Tokopedia Price Predictor",
    layout="wide",
)

# Directory (relative to the working directory) holding the pickled artifacts
# and the feature-importance CSV files read further down.
ARTIFACTS_DIR = 'artifacts'
@st.cache_resource
def _load_artifacts():
    """Load the stopword set and every pickled artifact exactly once.

    Streamlit re-executes the whole script on each widget interaction, so the
    expensive work (NLTK download, unpickling three models) is cached with
    ``st.cache_resource``. The cached objects are shared between reruns and
    must therefore be treated as read-only by callers.

    Returns:
        Tuple ``(vectorizer, price_preprocess, price_cols, price_models,
        final_stopwords)``.

    Raises:
        Exception: Whatever NLTK or joblib raises; failures are not cached, so
            the next rerun tries again.
    """
    nltk.download('stopwords')
    indo_stopwords = set(stopwords.words('indonesian'))
    # Marketplace slang and sales filler that the NLTK list does not cover.
    custom_stopwords = {
        'yg', 'dg', 'dgn', 'ny', 'nya', 'kalo', 'klo', 'gan', 'sis', 'kak', 'bro',
        'bosh', 'boss', 'nomor', 'wa', 'hub', 'cod', 'ready', 'stock', 'stok',
        'promo', 'murah', 'diskon', 'termurah', 'terlaris', 'bestseller'
    }
    final_stopwords = indo_stopwords.union(custom_stopwords)

    # NOTE: joblib.load unpickles arbitrary objects; the artifacts directory
    # must only ever contain files from a trusted source.
    vectorizer = joblib.load(os.path.join(ARTIFACTS_DIR, 'tfidf_vectorizer.pkl'))
    price_preprocess = joblib.load(os.path.join(ARTIFACTS_DIR, 'price_preprocessor.pkl'))
    price_cols = joblib.load(os.path.join(ARTIFACTS_DIR, 'price_columns.pkl'))
    price_models = {
        "Random Forest": joblib.load(os.path.join(ARTIFACTS_DIR, 'Random_Forest_price.pkl')),
        "XGBoost": joblib.load(os.path.join(ARTIFACTS_DIR, 'XGBoost_price.pkl')),
        "MLP": joblib.load(os.path.join(ARTIFACTS_DIR, 'MLP_price.pkl'))
    }
    return vectorizer, price_preprocess, price_cols, price_models, final_stopwords


def load_resources():
    """Return the shared model resources, or five ``None`` values on failure.

    Returns:
        Tuple ``(vectorizer, price_preprocess, price_cols, price_models,
        final_stopwords)``. When anything goes wrong while loading, the error
        is shown in the page via ``st.error`` and every element is ``None``.
    """
    try:
        return _load_artifacts()
    except Exception as e:
        # Top-level boundary: surface the problem in the UI instead of crashing.
        st.error(f"Error loading resources: {e}")
        return None, None, None, None, None
# Module-level handles used by the helpers and the UI below; load_resources()
# returns None for every one of them when loading failed.
vectorizer, price_preprocess, price_cols, price_models, final_stopwords = load_resources()
def get_categories(columns=None):
    """Return the sorted category names encoded as ``cat_*`` feature columns.

    Args:
        columns: Optional iterable of feature column names. When omitted, the
            module-level ``price_cols`` is used.

    Returns:
        Sorted list of category names with the ``cat_`` prefix removed, or
        ``['Other']`` when no columns are available or none has the prefix.
    """
    prefix = 'cat_'
    if columns is None:
        columns = price_cols
    # Explicit None check: truth-testing a multi-element array or Index raises.
    if columns is None:
        return ['Other']

    # Slice instead of str.replace so only the leading prefix is stripped.
    names = [
        col[len(prefix):]
        for col in columns
        if isinstance(col, str) and col.startswith(prefix)
    ]
    # Never return an empty list; callers feed the result to a selectbox.
    return sorted(names) if names else ['Other']
def clean_text(text):
    """Lower-case ``text``, keep only ``a-z0-9`` tokens and drop stopwords.

    Args:
        text: Value to clean; anything that is not a ``str`` yields ``""``.

    Returns:
        The surviving tokens joined by single spaces. Tokens found in the
        module-level ``final_stopwords`` are removed.
    """
    if not isinstance(text, str):
        return ""

    lowered = text.lower()
    # Every character outside a-z, 0-9 and whitespace becomes a space ...
    alnum_only = re.sub(r'[^a-z0-9\s]', ' ', lowered)
    # ... then whitespace runs are collapsed and the ends trimmed.
    collapsed = re.sub(r'\s+', ' ', alnum_only).strip()

    kept = (token for token in collapsed.split() if token not in final_stopwords)
    return " ".join(kept)
def get_date_features(date_obj, prefix):
    """Derive calendar features from a date, with keys namespaced by ``prefix``.

    Args:
        date_obj: Object exposing ``weekday()`` and ``month`` (e.g. a
            ``datetime.date``).
        prefix: String prepended to every key.

    Returns:
        Dict with ``<prefix>_dayofweek`` (0 = Monday), ``<prefix>_month`` and
        ``<prefix>_is_weekend`` (1 for Saturday/Sunday, else 0).
    """
    weekday = date_obj.weekday()
    features = {}
    features[f"{prefix}_dayofweek"] = weekday
    features[f"{prefix}_month"] = date_obj.month
    features[f"{prefix}_is_weekend"] = int(weekday >= 5)
    return features
def prepare_raw_dataframe(user_input, target_columns, vectorizer):
    """Build the one-row feature frame for a single listing.

    Args:
        user_input: Dict of form values. ``product_name``, ``description`` and
            ``category`` get special handling; any other key that matches a
            target column is copied into that column.
        target_columns: Column labels of the resulting frame.
        vectorizer: Object providing ``transform`` and
            ``get_feature_names_out`` (the fitted TF-IDF vectorizer).

    Returns:
        A single-row ``DataFrame`` with ``target_columns`` as columns; cells
        that nothing fills stay at 0.
    """
    frame = pd.DataFrame(0, index=[0], columns=target_columns)

    # Text features: cleaned name and description are joined by one space and
    # encoded; vocabulary terms map to columns named "word_<term>".
    name_text = clean_text(user_input.get('product_name', ''))
    desc_text = clean_text(user_input.get('description', ''))
    tfidf_row = vectorizer.transform([name_text + " " + desc_text])
    word_labels = [f"word_{w}" for w in vectorizer.get_feature_names_out()]
    tfidf_frame = pd.DataFrame(tfidf_row.toarray(), columns=word_labels)
    shared = frame.columns.intersection(tfidf_frame.columns)
    frame[shared] = tfidf_frame[shared]

    # Category one-hot: use the requested category's column when it exists,
    # otherwise fall back to 'cat_Other' if that column exists.
    requested = f"cat_{user_input.get('category', 'Other')}"
    for candidate in (requested, 'cat_Other'):
        if candidate in frame.columns:
            frame[candidate] = 1
            break

    # Remaining inputs are copied verbatim wherever the key is a column label.
    for key, value in user_input.items():
        if key in frame.columns:
            frame[key] = value

    return frame
def load_and_plot_importance(model_name, target_name):
    """Render a bar chart of the 15 first rows of a saved importance CSV.

    The file ``importance_data_<target_name>_<model_name>.csv`` (spaces in the
    model name replaced by underscores) is looked up in ``ARTIFACTS_DIR``.
    Nothing is rendered when the file does not exist.

    Args:
        model_name: Display name of the model; also used in the chart title.
        target_name: Target identifier used in the file name.
    """
    path = os.path.join(
        ARTIFACTS_DIR,
        f"importance_data_{target_name}_{model_name.replace(' ', '_')}.csv",
    )
    if not os.path.exists(path):
        return

    top_rows = pd.read_csv(path).head(15)
    fig = px.bar(
        top_rows,
        x='Importance',
        y='Feature',
        orientation='h',
        error_x='Std',
        title=f"Feature Importance ({model_name})",
        color='Importance',
    )
    # Reverse the y axis so the first CSV row is drawn at the top.
    fig.update_layout(yaxis={"autorange": "reversed"})
    st.plotly_chart(fig)
st.title("🛍️ Advanced E-Commerce Price Predictor")

# Top section: free-text product fields in the wide column, categorical
# choices in the narrow one.
# NOTE(review): nesting reconstructed from a flattened listing — confirm layout.
with st.container():
    c1, c2 = st.columns([2, 1])
    with c1:
        p_name = st.text_input("Product Name", "Jam tangan Automatic SF PS3/03 CCO Orange Rubber NFC Aktif fullset")
        p_desc = st.text_area("Description", 'SF PS3/03 aka "CCO" Kaca : Crystal Diameter : 47mm Weight : 138gr Case : Stainless Bezel : Stainless Strap : Rubber Buckle : Tang Movement : Automatic Japan Miyota 82A7 Quality : Clone 1:1 Function : Hour, Minute, Second, 24hour Indicator, Automatic. Include box fullset & extra rubber strap. Note : Untuk mencegah hal hal yg tidak diinginkan, harap membuat VIDEO UNBOXING sebelum membuka paket utk membantu jika diperlukannya klaim kerusakan atau kekurangan barang. Terima kasih')
    with c2:
        categories_list = get_categories()
        # list.index raises ValueError when the preferred default is absent
        # (e.g. only 'Other' is offered because the artifacts failed to load),
        # so fall back to the first entry instead of crashing the page.
        default_category = 'Fashion Pria'
        if default_category in categories_list:
            default_index = categories_list.index(default_category)
        else:
            default_index = 0
        category = st.selectbox("Category", categories_list, index=default_index)
        shop_tier = st.selectbox(
            "Shop Tier",
            ["Regular Merchant", "Power Merchant", "Official Store"]
        )
        condition = st.radio("Condition", ["New", "Used"], horizontal=True)
# Two equal-width columns hold the detailed inputs; this section fills the
# left one (physical/shipping data, shop performance, shop rating breakdown).
# NOTE(review): nesting reconstructed from a flattened listing — confirm that
# each widget sits in the intended expander.
c_left, c_right = st.columns(2)
with c_left:
    with st.expander("📦 Physical, Stock & Shipping", expanded=True):
        col1, col2, col3 = st.columns(3)
        weight = col1.number_input("Weight (g)", value=1000)
        # Stock and sold are later passed through np.log1p, which yields
        # -inf/NaN for values <= -1, so negatives are blocked at the widget.
        stock = col2.number_input("Stock", min_value=0, value=5)
        sold = col3.number_input("Sold", min_value=0, value=0)
        min_order = col1.number_input("Min Order", value=1)
        max_order = col2.number_input("Max Order", value=5)
        st.caption("Shipping Options")
        sc1, sc2, sc3 = st.columns(3)
        ship_inst = sc1.checkbox("Instant", False)
        ship_same = sc2.checkbox("Same Day", False)
        ship_next = sc3.checkbox("Next Day", False)
        ship_reg = sc1.checkbox("Regular", True)
        ship_cargo = sc2.checkbox("Cargo", False)
        ship_eco = sc3.checkbox("Economy", False)
        total_shipping = st.number_input("Total Shipping Types", value=2)
        is_preorder = st.checkbox("Preorder?", False)
        is_discount = st.checkbox("Discounted?", False)

    with st.expander("🏪 Shop Performance", expanded=False):
        # The second positional parameter of st.number_input is min_value, not
        # the default. Passing the default positionally made it the minimum
        # (e.g. shop age could not go below 365), so it is given as value=.
        shop_age = st.number_input("Shop Age (Days)", min_value=0, value=365)
        shop_pop = st.number_input("Shop Popularity Score", min_value=0, value=100)
        shop_city_pop = st.number_input("City Popularity", min_value=0, value=1)
        resp_time = st.number_input("Response Time (mins)", min_value=0, value=420)
        # 1662746294 is a Unix timestamp in seconds (2022-09-09). Without
        # unit='s' pandas reads the integer as nanoseconds, i.e. 1970-01-01.
        date_shop_open = st.date_input("Shop Open Since", pd.to_datetime(1662746294, unit='s'))

    with st.expander("⚙️ Advanced Shop Setup (Detailed Ratings)", expanded=False):
        st.caption("Input exact review counts for the entire shop")
        ac1, ac2, ac3, ac4, ac5 = st.columns(5)
        shop_r5 = ac1.number_input("Shop 5★", value=31)
        shop_r4 = ac2.number_input("Shop 4★", value=3)
        shop_r3 = ac3.number_input("Shop 3★", value=0)
        shop_r2 = ac4.number_input("Shop 2★", value=0)
        shop_r1 = ac5.number_input("Shop 1★", value=0)
        # Total count and star-weighted mean; the mean is 0.0 without ratings
        # to avoid dividing by zero.
        shop_total_rating = shop_r5 + shop_r4 + shop_r3 + shop_r2 + shop_r1
        weighted_shop_sum = (shop_r5*5 + shop_r4*4 + shop_r3*3 + shop_r2*2 + shop_r1*1)
        shop_rating_score = weighted_shop_sum / shop_total_rating if shop_total_rating > 0 else 0.0
        st.text(f"Calculated Total Ratings: {shop_total_rating}")
        st.text(f"Calculated Avg Score: {shop_rating_score:.2f}")
# Right column: product rating breakdown, review topic counts, media counts
# and the listing date.
# NOTE(review): nesting reconstructed from a flattened listing — confirm that
# each widget sits in the intended expander.
with c_right:
    with st.expander("⭐ Product Ratings & Reviews", expanded=True):
        c1, c2, c3, c4, c5 = st.columns(5)
        r5 = c1.number_input("Prod 5★", value=0)
        r4 = c2.number_input("Prod 4★", value=0)
        r3 = c3.number_input("Prod 3★", value=0)
        r2 = c4.number_input("Prod 2★", value=0)
        r1 = c5.number_input("Prod 1★", value=0)
        # Total count and star-weighted mean; the mean is 0.0 without ratings
        # to avoid dividing by zero.
        total_rating_count = r5 + r4 + r3 + r2 + r1
        weighted_sum = (r5*5 + r4*4 + r3*3 + r2*2 + r1*1)
        avg_rating = weighted_sum / total_rating_count if total_rating_count > 0 else 0.0
        st.info(f"Avg Rating: {avg_rating:.2f} ({total_rating_count} ratings)")
        total_reviews = st.number_input("Total Written Reviews", value=0)
        reviews_w_img = st.number_input("Reviews w/ Images", value=0)
        satisfaction = st.slider("Buyer Satisfaction %", 0, 100, 0)

    with st.expander("💬 Count Review Topics", expanded=False):
        tc1, tc2 = st.columns(2)
        # The second positional parameter of st.number_input is min_value; the
        # bound and the default are spelled out so both are visible.
        rev_qual = tc1.number_input("Quality", min_value=0, value=0)
        rev_srv = tc2.number_input("Service", min_value=0, value=0)
        rev_pack = tc1.number_input("Packaging", min_value=0, value=0)
        rev_price = tc2.number_input("Price", min_value=0, value=0)
        rev_desc = tc1.number_input("Description", min_value=0, value=0)
        rev_ship = tc2.number_input("Shipping", min_value=0, value=0)

    with st.expander("📸 Media & Listing Date", expanded=False):
        mc1, mc2 = st.columns(2)
        # Passing 1 and 4 positionally made them minimums (no listing with
        # zero videos or fewer than four images could be entered); they are
        # meant as defaults.
        vid_count = mc1.number_input("Videos", min_value=0, value=1)
        img_count = mc2.number_input("Images", min_value=0, value=4)
        date_listing = st.date_input("Listing Created", pd.to_datetime('2024-09-10T00:29:59+07:00'))
# Calendar features for the listing date and the shop opening date.
feat_created = get_date_features(date_listing, "created_at")
feat_shop = get_date_features(date_shop_open, "shop_open_since")

# Age of the listing in days relative to today; a listing date in the future
# is clamped to 0 instead of producing a negative age.
listing_age_calc = max((datetime.now().date() - date_listing).days, 0)

# Flat mapping from feature name to value. Keys that match a model column are
# copied into the feature frame by prepare_raw_dataframe; the three text and
# category entries are handled separately there.
base_input = {
    'product_name': p_name, 'description': p_desc, 'category': category,
    'condition_encoded': int(condition == 'New'),
    'weight_grams': weight, 'min_order': min_order, 'max_order': max_order,
    'is_preorder': int(bool(is_preorder)),
    'is_discount': int(bool(is_discount)),
    # One-hot encoding of the shop tier.
    'is_regular_merchant': int(shop_tier == "Regular Merchant"),
    'is_power_merchant': int(shop_tier == "Power Merchant"),
    'is_official_store': int(shop_tier == "Official Store"),
    'video_count': vid_count,
    'image_count': img_count,
    # Shipping option flags.
    'can_shipping_instant': int(bool(ship_inst)),
    'can_shipping_sameday': int(bool(ship_same)),
    'can_shipping_next_day': int(bool(ship_next)),
    'can_shipping_regular': int(bool(ship_reg)),
    'can_shipping_cargo': int(bool(ship_cargo)),
    'can_shipping_economy': int(bool(ship_eco)),
    'total_shipping_types': total_shipping,
    # Product ratings and reviews.
    'rating': avg_rating,
    'total_product_rating': total_rating_count,
    'total_product_review': total_reviews,
    'total_product_review_with_image': reviews_w_img,
    'buyer_satisfaction_percentage': satisfaction,
    'product_rating_5_star_count': r5,
    'product_rating_4_star_count': r4,
    'product_rating_3_star_count': r3,
    'product_rating_2_star_count': r2,
    'product_rating_1_star_count': r1,
    # Review topic counts.
    'total_review_about_kualitas': rev_qual,
    'total_review_about_pelayanan': rev_srv,
    'total_review_about_kemasan': rev_pack,
    'total_review_about_harga': rev_price,
    'total_review_about_sesuai deskripsi': rev_desc,
    'total_review_about_pengiriman': rev_ship,
    # Shop-level ratings and performance.
    'shop_rating_score': shop_rating_score,
    'shop_total_rating': shop_total_rating,
    'shop_rating_5': shop_r5,
    'shop_rating_4': shop_r4,
    'shop_rating_3': shop_r3,
    'shop_rating_2': shop_r2,
    'shop_rating_1': shop_r1,
    'shop_response_time_mins': resp_time,
    'shop_age_days': shop_age,
    'listing_age_days': listing_age_calc,
    'shop_popularity': shop_pop,
    'shop_city_popularity': shop_city_pop,
    **feat_created, **feat_shop
}

# log1p is -inf at -1 and NaN below it, so negative counts are clamped to 0
# before the transform.
base_input['Log_stock'] = np.log1p(max(stock, 0))
base_input['Log_sold'] = np.log1p(max(sold, 0))
st.divider()
st.subheader("💰 Price Prediction")

if not price_models:
    # load_resources() returned None for the models.
    st.error("Models failed to load.")
else:
    model_name = st.selectbox("Select Model", list(price_models.keys()), label_visibility="collapsed")
    predict_btn = st.button("Predict Price", type="primary", use_container_width=True)

    if predict_btn:
        raw_df = prepare_raw_dataframe(base_input, price_cols, vectorizer)

        # Add any expected column that is absent (filled with 0), then put the
        # columns in the order given by price_cols.
        missing_cols = set(price_cols) - set(raw_df.columns)
        for c in missing_cols:
            raw_df[c] = 0
        raw_df = raw_df[price_cols]

        X_scaled = price_preprocess.transform(raw_df)
        log_pred = price_models[model_name].predict(X_scaled)[0]
        # The prediction is mapped back with expm1 — presumably the models
        # were trained on log1p(price); confirm against the training code.
        price_pred = np.expm1(log_pred)
        st.success(f"Estimated Price: **Rp {price_pred:,.0f}**")

        # NOTE(review): the flattened source does not show whether this
        # expander belongs inside `if predict_btn:` or directly under the
        # model selector — confirm the intended placement.
        with st.expander("Feature Importance Analysis", expanded=True):
            load_and_plot_importance(model_name, "price")