# Last commit: b454d73 by elsayedelmandoh
# "create license, hf_loader, and hf_uploader file"
import streamlit as st
from src.utils import helpers
from src.config import settings
# Page-level settings (browser-tab title, icon, layout). Issued at module
# level, ahead of every other Streamlit call made by this script.
st.set_page_config(
    page_title=settings.PROJECT_NAME,
    page_icon="🕵️",
    layout="centered",
)
@st.cache_resource
def load_cached_assets():
    """Load the vectorizer/model assets, wrapped in ``st.cache_resource``.

    NLTK resource setup is attempted first on a best-effort basis: a failure
    there must not prevent the app from starting, but it is logged (with the
    traceback) instead of being silently discarded so it can be diagnosed.

    Returns:
        The value returned by ``helpers.load_assets()`` unchanged. ``main``
        reads it positionally (index 0 = vectorizer, 1..10 = models).

    Notes:
        Per the helper's documented behaviour, ``helpers.load_assets()``
        prefers local files under ``data/``. If a file is missing and the
        environment variable ``HF_ASSETS_REPO`` is set, the helper attempts to
        download the missing artifact from that Hugging Face repo and caches
        it under ``data/remote_cache/``.
    """
    import logging

    try:
        helpers.ensure_nltk_resources()
    except Exception:
        # Deliberately non-fatal: text cleaning may still work without the
        # extra resources, and any real problem resurfaces later with a
        # user-visible message. Keep a trace so the failure is not invisible.
        logging.getLogger(__name__).warning(
            "NLTK resource setup failed; continuing without it",
            exc_info=True,
        )

    return helpers.load_assets()
def _safe_predict(model, X):
    """Predict a single sample without ever raising.

    Args:
        model: Estimator exposing ``predict`` (and optionally
            ``predict_proba`` / ``decision_function`` / ``classes_``), or
            ``None`` when the model is unavailable.
        X: One-row feature matrix. If it exposes ``toarray`` (e.g. a sparse
            matrix) and prediction fails, a dense retry is attempted.

    Returns:
        A ``(pred, prob, error_str)`` tuple:

        * ``(pred, prob, None)`` on success; ``prob`` is ``None`` when no
          confidence could be derived.
        * ``(None, None, "predict failed: ...")`` when prediction failed.
        * ``(None, None, None)`` when ``model`` is ``None``.
    """
    if model is None:
        return None, None, None

    try:
        return (*_predict_with_confidence(model, X), None)
    except Exception as first_error:
        # Capture the text now: the exception name is unbound once this
        # handler exits.
        first_message = f"predict failed: {first_error}"

    if hasattr(X, "toarray"):
        # Some estimators reject sparse input; retry once with a dense copy.
        try:
            dense_X = X.toarray()
        except Exception:
            # Densifying failed, so the original error is the useful one.
            return None, None, first_message
        try:
            return (*_predict_with_confidence(model, dense_X), None)
        except Exception as dense_error:
            return None, None, f"predict failed: {dense_error}"

    return None, None, first_message


def _predict_with_confidence(model, input_X):
    """Return ``(pred, prob)`` for the first row; ``predict`` errors propagate."""
    pred = model.predict(input_X)[0]
    return pred, _estimate_confidence(model, input_X, pred)


def _estimate_confidence(model, input_X, pred):
    """Return a best-effort confidence for *pred*, or ``None`` if unavailable."""
    if hasattr(model, "predict_proba"):
        # Confidence is optional, so any failure here degrades to ``None``
        # rather than discarding an otherwise valid prediction.
        try:
            probs = model.predict_proba(input_X)[0]
            classes = getattr(model, "classes_", None)
            if classes is not None:
                try:
                    # Probability column aligned with the predicted class.
                    return float(probs[list(classes).index(pred)])
                except (ValueError, IndexError, TypeError):
                    pass  # class not found: fall back to the top probability
            return float(max(probs))
        except Exception:
            return None

    if hasattr(model, "decision_function"):
        try:
            score = model.decision_function(input_X)
            # Only the (n_samples,) shape is handled; a per-class score
            # matrix has no single signed margin to squash.
            if not (hasattr(score, "__len__") and getattr(score, "ndim", 0) == 1):
                return None
            prob_pos = _stable_sigmoid(float(score[0]))
            classes = getattr(model, "classes_", None)
            if classes is not None and len(classes) >= 2:
                # Assumes classes_[1] is the positive side of the margin.
                return prob_pos if pred == classes[1] else 1.0 - prob_pos
            return prob_pos
        except Exception:
            return None

    return None


def _stable_sigmoid(value):
    """Return ``1 / (1 + e**-value)`` without overflowing for large ``|value|``."""
    import math

    if value >= 0:
        return 1.0 / (1.0 + math.exp(-value))
    # For negative inputs exp(-value) can raise OverflowError; exp(value)
    # merely underflows to 0.0, giving the same mathematical result.
    exp_value = math.exp(value)
    return exp_value / (1.0 + exp_value)
def map_label(pred, model=None):
    """Translate a raw model prediction into a display label.

    Two integer encodings are recognised:

    * ``{0, 1}``: 0 is Negative, 1 is Positive
    * ``{1, 2}``: 1 is Negative, 2 is Positive

    When ``model`` carries a ``classes_`` attribute matching one of those
    sets, it decides which encoding applies. Otherwise a heuristic is used:
    0 is Negative, 1 and 2 are Positive, and strings containing "neg"/"pos"
    (case-insensitive) map accordingly. ``None`` yields "Unknown"; anything
    unrecognised is returned as ``str(pred)``.
    """
    if pred is None:
        return "Unknown"

    # (class set, value that means "Negative") for each supported encoding.
    known_encodings = (
        ({0, 1}, 0),
        ({1, 2}, 1),
    )
    try:
        if model is not None and hasattr(model, "classes_"):
            class_set = set(tuple(model.classes_))
            for encoding, negative_value in known_encodings:
                if class_set == encoding:
                    return "Negative" if int(pred) == negative_value else "Positive"
    except Exception:
        # Unusable classes_ or a non-numeric pred: use the heuristics below.
        pass

    # Model-independent guess for integer-like predictions.
    heuristic_labels = {0: "Negative", 1: "Positive", 2: "Positive"}
    try:
        guessed = heuristic_labels.get(int(pred))
    except Exception:
        guessed = None
    if guessed is not None:
        return guessed

    # Textual predictions such as "neg" / "POSITIVE".
    if isinstance(pred, str):
        lowered = pred.lower()
        for marker, label in (("neg", "Negative"), ("pos", "Positive")):
            if marker in lowered:
                return label

    return str(pred)
def main():
    """Render the page: review input, per-model predictions, and details.

    Flow:
        1. Show the title/caption and load the cached assets.
        2. Collect a review; nothing more happens until "Analyze" is pressed.
        3. Validate, clean and vectorize the text, halting the run with a
           visible message (``st.stop()``) at the first problem.
        4. Show each model's label and confidence in a three-column grid.
        5. Echo the original and cleaned text.
    """
    st.title(settings.PROJECT_NAME)
    if settings.PROJECT_DESCRIPTION:
        st.caption(settings.PROJECT_DESCRIPTION)

    assets = load_cached_assets()
    if assets is None:
        # Treat "nothing loaded" as an empty collection so every lookup
        # below yields None instead of len() raising.
        assets = ()

    def asset_at(index):
        """Return the asset at *index*, or None if it was not loaded."""
        return assets[index] if len(assets) > index else None

    # Positional layout of the assets: index 0 is the vectorizer, followed by
    # the models in this display order.
    vectorizer = asset_at(0)
    model_names = (
        "Logistic Regression",
        "Naive Bayes",
        "FT SVM",
        "Linear SVM",
        "KNN",
        "Decision Tree",
        "Random Forest",
        "SGD",
        "XGBoost",
        "LightGBM",
    )
    model_list = [
        (name, asset_at(index))
        for index, name in enumerate(model_names, start=1)
    ]

    st.markdown("---")
    st.subheader("Analyze a custom Amazon review")
    review = st.text_area("Paste a review here", height=200)
    _, btn_col, _ = st.columns([1, 2, 1])
    analyze = btn_col.button("Analyze")
    if not analyze:
        return

    if not review or not str(review).strip():
        st.warning("Please enter a review to analyze.")
        st.stop()

    try:
        cleaned = helpers.clean_text(str(review))
    except Exception as e:
        st.error(f"Error during text cleaning: {e}")
        st.stop()
    if not cleaned:
        st.warning("Input text became empty after cleaning. Try a different review.")
        st.stop()

    if vectorizer is None:
        st.error("TF-IDF vectorizer not available. Ensure `data/vectorizers/tfidf_vectorizer.joblib` exists.")
        st.stop()
    try:
        X = vectorizer.transform([cleaned])
    except Exception as e:
        st.error(f"Error during vectorization: {e}")
        st.stop()

    cols = st.columns(3)
    for i, (name, model) in enumerate(model_list):
        with cols[i % 3]:
            st.subheader(name)
            raw, prob, err = _safe_predict(model, X)
            label = map_label(raw, model)
            if label == "Positive":
                st.success(label)
            elif label == "Negative":
                st.error(label)
            elif label == "Unknown":
                if err:
                    st.write("Model error:")
                    st.caption(err)
                else:
                    st.write("Model unavailable or prediction failed.")
            else:
                st.info(label)

            if prob is not None:
                st.caption(f"Confidence: {prob:.2%}")
            elif err and label != "Unknown":
                # The "Unknown" branch already printed the error; only add
                # the hint when it has not been shown yet.
                st.caption(err)

    st.markdown("---")
    st.subheader("Details")
    st.write("**Original**:")
    st.write(review)
    st.write("**Cleaned**:")
    st.write(cleaned)
# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()