"""Streamlit app that predicts Stack Overflow tags for a question.

The script loads a pre-trained classifier, its fitted vectorizer and the
label binarizer from disk, cleans the text typed by the user with
``preprocess`` and lists the ``top_k`` tags with the highest predicted
probability.
"""
import re

import joblib
import nltk
import streamlit as st
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Not referenced directly below; kept because they were imported originally.
from sklearn.feature_extraction.text import TfidfVectorizer  # noqa: F401
from sklearn.preprocessing import MultiLabelBinarizer  # noqa: F401

# Page configuration comes first so it precedes every other Streamlit
# command, including the spinner that cached loaders may render.
st.set_page_config(page_title="Stack Overflow Tag Predictor", layout="centered")

# --- Artifact locations ---
MODEL_PATH = r"E:\Stack-Overflow\models\model.pkl"
VECTORIZER_PATH = r"E:\Stack-Overflow\models\vectorizer.pkl"  # TF-IDF vectorizer
MLB_PATH = r"E:\Stack-Overflow\models\mlb.pkl"  # MultiLabelBinarizer

# Same character class as before, compiled once: everything that is not a
# lowercase ASCII letter or whitespace is removed.
_NON_LETTER_RE = re.compile(r'[^a-z\s]')


@st.cache_resource
def _ensure_nltk_data():
    """Download the NLTK resources used by ``preprocess`` once per process.

    Streamlit re-executes this script on every widget interaction; caching
    keeps the downloader from being invoked on each rerun.
    """
    # NOTE(review): recent NLTK releases make word_tokenize look up
    # "punkt_tab" instead of "punkt"; both are requested so either version
    # finds its tokenizer data — confirm against the installed NLTK.
    for package in ("punkt", "punkt_tab", "stopwords"):
        nltk.download(package, quiet=True)
    return True


@st.cache_resource
def _load_stop_words():
    """Return the English stop-word list as a frozenset, built once."""
    return frozenset(stopwords.words('english'))


@st.cache_resource
def _load_artifacts():
    """Load the model, vectorizer and label binarizer from disk once.

    Returns:
        A ``(model, vectorizer, mlb)`` tuple.
    """
    # SECURITY: joblib.load unpickles its input and can execute arbitrary
    # code; these paths must only ever point at trusted files.
    return (
        joblib.load(MODEL_PATH),
        joblib.load(VECTORIZER_PATH),
        joblib.load(MLB_PATH),
    )


_ensure_nltk_data()

# --- Load artifacts ---
model, vectorizer, mlb = _load_artifacts()


# --- Preprocessing function ---
def preprocess(text):
    """Normalize raw question text before vectorization.

    Lowercases ``text``, strips every character other than ``a``-``z`` and
    whitespace, tokenizes with NLTK and drops English stop words.

    Args:
        text: Raw question text.

    Returns:
        The remaining tokens joined by single spaces; an empty string when
        nothing is left.
    """
    text = _NON_LETTER_RE.sub('', text.lower())
    stop_words = _load_stop_words()
    return " ".join(t for t in word_tokenize(text) if t not in stop_words)


# --- Streamlit UI ---
st.title("💬 Stack Overflow Tag Predictor")
st.markdown("Enter a question (title + body) and get predicted tags.")

user_input = st.text_area("✍️ Question Title + Body", height=200)
top_k = st.slider("Number of tags to show", min_value=1, max_value=10, value=5)

if st.button("Predict Tags"):
    if not user_input.strip():
        # Previously a click with empty input did nothing at all.
        st.warning("Please enter a question before predicting tags.")
    else:
        cleaned = preprocess(user_input)
        if not cleaned:
            # Only stop words / non-letters were entered: the feature vector
            # would be all zeros and the ranking would not reflect the text.
            st.warning(
                "No usable words were left after cleaning the text. "
                "Please add more detail to the question."
            )
        else:
            X_vec = vectorizer.transform([cleaned])
            y_pred_proba = model.predict_proba(X_vec)
            # assumes predict_proba returns one row of per-tag probabilities
            # aligned with mlb.classes_ — TODO confirm for the saved model
            probabilities = y_pred_proba[0]

            # Get top-k tag predictions, highest probability first
            top_indices = probabilities.argsort()[-top_k:][::-1]

            st.markdown("### 🏷️ Predicted Tags:")
            for i in top_indices:
                tag = mlb.classes_[i]
                conf = probabilities[i]
                st.markdown(f"- **{tag}** (confidence: {conf:.2f})")