import streamlit as st
import pickle
import numpy as np
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
# Fetch the NLTK corpora this script requests at startup; the calls run in the
# same order as before, one per resource name.
for resource in ("stopwords", "punkt", "wordnet"):
    nltk.download(resource)
# Load required models and vectorizers
def _load_pickle(path):
    """Open *path* in binary mode and return the object unpickled from it."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


# NOTE: unpickling can execute arbitrary code, so these artifact files must
# come from a trusted source.
model = _load_pickle("final_model.pkl")
tfidf_vectorizer = _load_pickle("tfidf_vectorizer.pkl")
count_vectorizer = _load_pickle("count_vectorizer.pkl")

# Set of English stop words, used for membership tests during text cleaning.
stop_words = set(stopwords.words("english"))
# Streamlit setup
st.set_page_config(page_title="Stack Overflow Tag Predictor")

# NOTE(review): this markdown payload is only a newline; it presumably held
# custom HTML/CSS at some point -- confirm whether content is missing.
st.markdown(
    """
""",
    unsafe_allow_html=True,
)

st.title("🧠Stack Overflow Tag Predictor")

# The original literal was a single-quoted string broken across two physical
# lines, which is a syntax error. The same value (one newline) is written here
# as an escape sequence so the script parses.
# NOTE(review): like the block above, this may have lost HTML content -- confirm.
st.markdown("\n", unsafe_allow_html=True)
# Preprocessing function
def clean_text(text):
    """Normalize raw question text into a space-joined string of kept words.

    Steps, in order: extract text with BeautifulSoup's ``html.parser``, remove
    any remaining ``<...>`` spans, delete every character that is not an ASCII
    letter or whitespace, lowercase, split on whitespace, and drop words that
    are in ``stop_words`` or are two characters or shorter.

    Args:
        text: The raw input. Anything that is not a ``str`` is rejected.

    Returns:
        The cleaned words joined by single spaces, or ``""`` when *text* is
        not a string.
    """
    if not isinstance(text, str):
        return ""

    plain = BeautifulSoup(text, "html.parser").get_text()
    plain = re.sub(r"<.*?>", "", plain)
    # Non-letters are deleted rather than replaced by a space, so characters
    # on either side of a removed symbol end up joined.
    letters_only = re.sub(r"[^a-zA-Z\s]", "", plain).lower()

    kept = []
    for token in letters_only.split():
        if len(token) > 2 and token not in stop_words:
            kept.append(token)
    return " ".join(kept)
# Prediction function
def predict_tags(text):
    """Predict tags for a single question.

    The text is cleaned with ``clean_text``, vectorized with
    ``tfidf_vectorizer``, and passed to ``model.predict``. The first row of
    the prediction is treated as a per-tag indicator, and the names from
    ``count_vectorizer.get_feature_names_out()`` whose indicator equals 1 are
    returned.

    Args:
        text: Raw question title and/or body.

    Returns:
        A list of predicted tag names, in feature-name order; empty when no
        indicator equals 1.

    Raises:
        ValueError: If the number of tag names does not match the width of
            the prediction row.
    """
    cleaned = clean_text(text)
    question_vec = tfidf_vectorizer.transform([cleaned])
    prediction = model.predict(question_vec)

    # Accept a sparse matrix or a dense array, instead of assuming the result
    # always has ``toarray``.
    if hasattr(prediction, "toarray"):
        prediction = prediction.toarray()
    indicator_row = np.asarray(prediction)[0]

    tag_names = count_vectorizer.get_feature_names_out()
    # The DataFrame constructor used previously rejected a width mismatch;
    # keep that guard explicit so zip() cannot silently truncate.
    if len(tag_names) != len(indicator_row):
        raise ValueError(
            f"Prediction has {len(indicator_row)} columns but the tag "
            f"vectorizer defines {len(tag_names)} tags."
        )

    return [tag for tag, flag in zip(tag_names, indicator_row) if flag == 1]
# User input
question = st.text_area(
    "Enter your Stack Overflow question title and/or description",
    height=200,
)

# Everything below runs only on the script pass where the button was clicked.
if st.button("Predict Tags"):
    if question.strip():
        predicted_tags = predict_tags(question)
        st.subheader("✅ Predicted Tags:")
        if not predicted_tags:
            st.info("No tags predicted. Try refining your question.")
        else:
            # One success box per predicted tag, rendered as "#tag".
            for tag in predicted_tags:
                st.success(f"#{tag}")
    else:
        # Blank or whitespace-only input: ask for text instead of predicting.
        st.warning("Please enter a question to predict tags.")