Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| from src.utils import helpers | |
| from src.config import settings | |
# Page-level settings: tab title from the project settings, detective icon,
# centered layout.
st.set_page_config(
    page_title=settings.PROJECT_NAME,
    page_icon="🕵️",
    layout="centered",
)
def load_cached_assets():
    """Prepare NLTK data (best-effort) and return the app's model assets.

    Returns:
        Whatever ``helpers.load_assets()`` returns, unchanged. ``main``
        indexes the result positionally (vectorizer first, then the models).

    Notes:
        Per the original note on this function, ``helpers.load_assets()``
        prefers local files under ``data/``; if a file is missing and the
        ``HF_ASSETS_REPO`` environment variable is set, it attempts to
        download the artifact from that Hugging Face repo and cache it under
        ``data/remote_cache/``. That logic lives in ``helpers`` and is not
        visible from this function.

        NOTE(review): despite the name, nothing in this function caches the
        result, so assets are reloaded on every call - confirm whether a
        Streamlit resource cache was intended.
    """
    import logging

    try:
        helpers.ensure_nltk_resources()
    except Exception:
        # Deliberately non-fatal: the app should still start without NLTK
        # data, but leave a trace instead of hiding the failure completely.
        logging.getLogger(__name__).warning(
            "Could not ensure NLTK resources; continuing without them.",
            exc_info=True,
        )

    return helpers.load_assets()
| def _safe_predict(model, X): | |
| """Try to predict and return (pred, prob, error_str). | |
| On failure, attempt a dense fallback for sparse `X` and surface the exception message. | |
| """ | |
| if model is None: | |
| return None, None, None | |
| def _try_predict(input_X): | |
| pred = model.predict(input_X)[0] | |
| prob = None | |
| # First try predict_proba when available | |
| if hasattr(model, "predict_proba"): | |
| try: | |
| probs = model.predict_proba(input_X)[0] | |
| if hasattr(model, "classes_"): | |
| try: | |
| idx = int((model.classes_ == pred).nonzero()[0][0]) | |
| prob = float(probs[idx]) | |
| except Exception: | |
| prob = float(probs.max()) | |
| else: | |
| prob = float(probs.max()) | |
| except Exception: | |
| prob = None | |
| # If no predict_proba, try decision_function fallback for an approximate confidence | |
| elif hasattr(model, "decision_function"): | |
| try: | |
| score = model.decision_function(input_X) | |
| # decision_function can return (n_samples,) or (n_samples, n_classes) | |
| if hasattr(score, '__len__') and getattr(score, 'ndim', 0) == 1: | |
| score_val = float(score[0]) | |
| # convert distance to a pseudo-probability via a sigmoid | |
| prob_pos = 1.0 / (1.0 + __import__('math').exp(-score_val)) | |
| # If classes_ available, align probability to predicted class | |
| if hasattr(model, 'classes_') and len(model.classes_) >= 2: | |
| # assume classes_[1] corresponds to the positive side of decision_function | |
| if pred == model.classes_[1]: | |
| prob = float(prob_pos) | |
| else: | |
| prob = float(1.0 - prob_pos) | |
| else: | |
| prob = float(max(min(prob_pos, 1.0), 0.0)) | |
| else: | |
| # multi-dimensional decision function — skip | |
| prob = None | |
| except Exception: | |
| prob = None | |
| return pred, prob | |
| try: | |
| return (*_try_predict(X), None) | |
| except Exception as e1: | |
| # If X is sparse, try dense fallback | |
| try: | |
| if hasattr(X, "toarray"): | |
| X_dense = X.toarray() | |
| try: | |
| return (*_try_predict(X_dense), None) | |
| except Exception as e2: | |
| return None, None, f"predict failed: {e2}" | |
| except Exception: | |
| pass | |
| return None, None, f"predict failed: {e1}" | |
def map_label(pred, model=None):
    """Translate a raw model prediction into a display label.

    Two integer encodings are recognised:
      - {0, 1}: 0 is Negative, 1 is Positive
      - {1, 2}: 1 is Negative, 2 is Positive

    When ``model`` exposes ``classes_`` matching one of these sets, that
    encoding decides the answer. Otherwise integer heuristics are tried, then
    "neg"/"pos" substrings for string predictions, and finally ``str(pred)``
    is returned. ``None`` maps to "Unknown".
    """
    if pred is None:
        return "Unknown"

    # Encoding advertised by the model, if any: (class set, Negative value).
    try:
        if model is not None and hasattr(model, "classes_"):
            advertised = set(tuple(model.classes_))
            for encoding, negative_value in (({0, 1}, 0), ({1, 2}, 1)):
                if advertised == encoding:
                    if int(pred) == negative_value:
                        return "Negative"
                    return "Positive"
    except Exception:
        pass

    # No usable class information: guess from the integer value alone.
    try:
        as_int = int(pred)
    except Exception:
        as_int = None
    if as_int == 0:
        return "Negative"
    if as_int in (1, 2):
        return "Positive"

    # Textual labels such as "negative" / "POS".
    if isinstance(pred, str):
        lowered = pred.lower()
        for marker, label in (("neg", "Negative"), ("pos", "Positive")):
            if marker in lowered:
                return label

    return str(pred)
def main():
    """Render the review-analysis page.

    Shows the project title, loads the vectorizer and classifiers, and, once
    the user presses "Analyze", cleans and vectorizes the review, runs every
    model on it, and displays each model's label and confidence in a
    three-column grid, followed by the original and cleaned text.

    Input problems (empty review, cleaning or vectorization failure, missing
    vectorizer) are reported on the page and end the run via ``st.stop()``.
    """
    st.title(settings.PROJECT_NAME)
    if settings.PROJECT_DESCRIPTION:
        st.caption(settings.PROJECT_DESCRIPTION)

    # Display names, in the positional order the models follow the vectorizer
    # in the assets sequence.
    model_names = (
        "Logistic Regression",
        "Naive Bayes",
        "FT SVM",
        "Linear SVM",
        "KNN",
        "Decision Tree",
        "Random Forest",
        "SGD",
        "XGBoost",
        "LightGBM",
    )

    # Tolerate a missing or short asset bundle: absent entries become None so
    # the matching card reports the model as unavailable.
    assets = tuple(load_cached_assets() or ())
    vectorizer = assets[0] if assets else None
    models = assets[1:1 + len(model_names)]
    models += (None,) * (len(model_names) - len(models))

    st.markdown("---")
    st.subheader("Analyze a custom Amazon review")
    review = st.text_area("Paste a review here", height=200)
    _, btn_col, _ = st.columns([1, 2, 1])
    if not btn_col.button("Analyze"):
        return

    if not review or not str(review).strip():
        st.warning("Please enter a review to analyze.")
        st.stop()

    try:
        cleaned = helpers.clean_text(str(review))
    except Exception as e:
        st.error(f"Error during text cleaning: {e}")
        st.stop()
    if not cleaned:
        st.warning("Input text became empty after cleaning. Try a different review.")
        st.stop()

    if vectorizer is None:
        st.error("TF-IDF vectorizer not available. Ensure `data/vectorizers/tfidf_vectorizer.joblib` exists.")
        st.stop()
    try:
        X = vectorizer.transform([cleaned])
    except Exception as e:
        st.error(f"Error during vectorization: {e}")
        st.stop()

    # One card per model, filled left-to-right across three columns.
    cols = st.columns(3)
    for i, (name, model) in enumerate(zip(model_names, models)):
        with cols[i % 3]:
            st.subheader(name)
            raw, prob, err = _safe_predict(model, X)
            label = map_label(raw, model)

            if label == "Positive":
                st.success(label)
            elif label == "Negative":
                st.error(label)
            elif label == "Unknown":
                if err:
                    st.write("Model error:")
                    st.caption(err)
                else:
                    st.write("Model unavailable or prediction failed.")
            else:
                st.info(label)

            if prob is not None:
                st.caption(f"Confidence: {prob:.2%}")
            elif err and label != "Unknown":
                # The "Unknown" branch already printed the error; only add
                # the hint here when it has not been shown yet.
                st.caption(err)

    st.markdown("---")
    st.subheader("Details")
    st.write("**Original**:")
    st.write(review)
    st.write("**Cleaned**:")
    st.write(cleaned)
# Script entry point: build the page only when this file is executed directly.
if __name__ == "__main__":
    main()