ytrsoymr's picture
Rename app.py.py to app.py
177092e verified
import os
import re

import joblib
import nltk
import streamlit as st
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
# Fetch the NLTK data that preprocess() relies on: tokenizer models for
# word_tokenize and the English stop-word list.
# 'punkt_tab' is requested in addition to 'punkt' because recent NLTK
# releases (3.9 and later) load the tokenizer from 'punkt_tab'; with only
# 'punkt' installed, word_tokenize raises LookupError on those versions.
for _nltk_package in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(_nltk_package)
# --- Load artifacts ---
# Directory containing model.pkl, vectorizer.pkl and mlb.pkl. It defaults to
# the original development location; set TAG_PREDICTOR_MODEL_DIR to point the
# app at another directory without editing the code.
MODEL_DIR = os.environ.get("TAG_PREDICTOR_MODEL_DIR", r"E:\Stack-Overflow\models")


# Streamlit re-executes this script on every widget interaction, so the
# artifacts are cached instead of being unpickled on each rerun.
# show_spinner=False keeps the cached call from rendering anything before
# st.set_page_config() runs further down the script.
@st.cache_resource(show_spinner=False)
def _load_artifacts(model_dir):
    """Load the classifier, the TF-IDF vectorizer and the MultiLabelBinarizer.

    Args:
        model_dir: Directory that holds ``model.pkl``, ``vectorizer.pkl`` and
            ``mlb.pkl``.

    Returns:
        A ``(model, vectorizer, mlb)`` tuple of the unpickled objects.

    Raises:
        FileNotFoundError: If one of the artifact files is missing.
    """
    # NOTE: joblib.load unpickles the files and can therefore execute
    # arbitrary code; only point model_dir at artifacts you trust.
    return tuple(
        joblib.load(os.path.join(model_dir, filename))
        for filename in ("model.pkl", "vectorizer.pkl", "mlb.pkl")
    )


model, vectorizer, mlb = _load_artifacts(MODEL_DIR)
# --- Preprocessing function ---
def preprocess(text):
    """Turn raw question text into a space-separated string of tokens.

    The text is lower-cased, every character other than a-z and whitespace
    is deleted, the remainder is split with NLTK's ``word_tokenize``, and
    English stop words are discarded.

    Args:
        text: The raw question text.

    Returns:
        The surviving tokens joined by single spaces (an empty string when
        nothing survives).
    """
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    words = word_tokenize(letters_only)
    english_stops = set(stopwords.words('english'))
    return " ".join(word for word in words if word not in english_stops)
# --- Streamlit UI ---
# Page chrome: title plus a short usage hint.
st.set_page_config(page_title="Stack Overflow Tag Predictor", layout="centered")
st.title("💬 Stack Overflow Tag Predictor")
st.markdown("Enter a question (title + body) and get predicted tags.")

# Inputs: the question text and how many of the best-scoring tags to list.
user_input = st.text_area("✍️ Question Title + Body", height=200)
top_k = st.slider("Number of tags to show", min_value=1, max_value=10, value=5)

if st.button("Predict Tags"):
    # Skip preprocessing entirely for blank input.
    cleaned = preprocess(user_input) if user_input.strip() else ""

    if not cleaned:
        # Either nothing was typed, or everything was removed by preprocess()
        # (only digits, punctuation or stop words). Tell the user instead of
        # silently doing nothing or scoring an empty document.
        st.warning(
            "Please enter a question containing some descriptive words "
            "before requesting predictions."
        )
    else:
        X_vec = vectorizer.transform([cleaned])
        # NOTE(review): assumes predict_proba returns a 2-D array with one row
        # per input and one column per tag, ordered like mlb.classes_ —
        # confirm against how the model was trained.
        y_pred_proba = model.predict_proba(X_vec)
        scores = y_pred_proba[0]

        # Get top-k tag predictions: argsort is ascending, so take the last
        # top_k indices and reverse them to list the highest score first.
        top_indices = scores.argsort()[-top_k:][::-1]

        st.markdown("### 🏷️ Predicted Tags:")
        for i in top_indices:
            tag = mlb.classes_[i]
            conf = scores[i]
            st.markdown(f"- **{tag}** (confidence: {conf:.2f})")