import os

os.environ["CUDA_VISIBLE_DEVICES"] = "-1"   # force CPU (no GPU on free Spaces)
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"

import re
import nltk
import pickle
import numpy as np
import pandas as pd
import streamlit as st
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Note: the Keras Tokenizer class is not needed here (the fitted tokenizer is
# unpickled below), but this import path is the one to use for re-training.

# st.set_page_config must be the first Streamlit call in the script;
# the cached setup/loader functions below may emit Streamlit elements.
st.set_page_config(page_title="Ticket Classification")

# Alias the Keras model loader (used for the .h5 artifact below).
load_model = tf.keras.models.load_model

# --- NLTK Configuration for Hugging Face Spaces ---
# HF Spaces have persistent storage, but downloading NLTK data on
# startup is safer for fresh environment builds.
@st.cache_resource
def setup_nltk():
    """Downloads required NLTK data and returns the English stopword set."""
    try:
        nltk.data.find("tokenizers/punkt")
    except LookupError:
        nltk.download("punkt")
    try:
        # NLTK >= 3.9 ships the punkt model as a separate punkt_tab package.
        nltk.data.find("tokenizers/punkt_tab")
    except LookupError:
        nltk.download("punkt_tab")
    try:
        nltk.data.find("corpora/stopwords")
    except LookupError:
        nltk.download("stopwords")
    return set(stopwords.words("english"))

stop_english = setup_nltk()

# --- File Paths and Loading (CRITICAL for HF Spaces) ---
# Ensure these files are uploaded to your Hugging Face repository
# alongside this app.py file.
MODEL_PATH = "model.h5"
LE_TYPE_PATH = "le_type.pkl"
LE_QUEUE_PATH = "le_queue.pkl"
MLB_PATH = "mlb.pkl"
TOKENIZER_PATH = "tokenizer.pkl"

MAX_SEQ_LEN = 107  # MUST match the sequence length used during training

@st.cache_resource
def load_resources():
    """Loads the trained model and all pickled preprocessors."""
    try:
        # compile=False skips restoring the optimizer/loss state,
        # which is unnecessary (and can fail) for inference-only use.
        model = load_model(MODEL_PATH, compile=False)

        with open(LE_TYPE_PATH, "rb") as f:
            le_type = pickle.load(f)
        with open(LE_QUEUE_PATH, "rb") as f:
            le_queue = pickle.load(f)
        with open(MLB_PATH, "rb") as f:
            mlb = pickle.load(f)
        with open(TOKENIZER_PATH, "rb") as f:
            tokenizer = pickle.load(f)

        return model, le_type, le_queue, mlb, tokenizer
    except FileNotFoundError as e:
        st.error(
            f"Required file not found: {e}. Please ensure all artifacts "
            "(model.h5, *.pkl) are uploaded."
        )
        st.stop()
    except Exception as e:
        st.error(f"An error occurred while loading resources: {e}")
        st.stop()

model, le_type, le_queue, mlb, tokenizer = load_resources()

# --- Text Preprocessing Functions ---
def clean_text(t):
    """Cleans a raw ticket string: removes URLs/emails/punctuation,
    lowercases, tokenizes, and drops stopwords and very short tokens."""
    if pd.isna(t) or t is None:
        return ""
    t = t.lower()
    # Regex cleaning first, so URLs and emails are stripped whole
    # before the tokenizer can split them apart.
    t = re.sub(r"http\S+|www\.\S+|\S+@\S+|\n", " ", t)  # URLs, emails, newlines
    t = re.sub(r"[^a-zA-Z0-9\s]", " ", t)  # drop remaining punctuation
    t = re.sub(r"\s+", " ", t).strip()     # consolidate whitespace
    # Tokenize, then remove stopwords and tokens shorter than three characters.
    tokens = word_tokenize(t)
    tokens = [w for w in tokens if w not in stop_english and len(w) > 2 and w.isalnum()]
    return " ".join(tokens)
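# Illustrative trace of clean_text (a sketch, not authoritative -- the exact
# output depends on the NLTK stopword list installed at runtime):
#   clean_text("Please visit https://example.com -- my account is LOCKED!!")
#   -> "please visit account locked"
# ("my" and "is" are NLTK stopwords; the URL and punctuation are stripped
# by the regexes before tokenization.)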
t = re.sub(r"http\S+|www\.\S+|@\S+|\\n", " ", t) # URLs, emails, newlines # Removing most punctuation but keeping spaces t = re.sub(r"[^a-zA-Z0-9\s]", " ", t) t = re.sub(r"\s+", " ", t).strip() # Consolidate spaces return t def convert_to_sequence(txt): """Converts cleaned text to a padded sequence.""" seq = tokenizer.texts_to_sequences([txt]) # Input must be a list padded = pad_sequences( seq, maxlen=MAX_SEQ_LEN, padding="pre", truncating="pre" ) return padded # --- Streamlit UI --- st.set_page_config(page_title="Ticket Classification") st.title("🎫 Ticket Classification App") # Example Text Display st.header("Example Input") st.markdown("**Subject:** Account Disruption") st.code("""Dear Customer Support Team, I am writing to report a significant problem with the centralized account management portal...""") st.write("---") # Input Fields col1, col2 = st.columns(2) with col1: subject = st.text_input("Enter your **Subject**:", key="subject_input") with col2: body = st.text_area("Enter your **Body**:", key="body_input", height=100) # --- Prediction Logic --- if st.button("Submit"): if not subject and not body: st.warning("Please enter a subject or body text to classify.") else: # Combine and Clean raw_text = subject + " " + body cleaned = clean_text(raw_text) st.subheader("Preprocessing Results") st.info(f"**Cleaned Text:** {cleaned}") # Convert and Predict seq = convert_to_sequence(cleaned) with st.spinner("Classifying ticket..."): preds = model.predict(seq, verbose=0) pred_type_probs, pred_queue_probs, pred_tags_probs = preds # 1. Decode single-label outputs pred_type = le_type.inverse_transform([np.argmax(pred_type_probs)])[0] pred_queue = le_queue.inverse_transform([np.argmax(pred_queue_probs)])[0] # 2. Decode multi-label outputs (Tags) pred_tags_binary = (pred_tags_probs >= 0.5).astype(int) # mlb.inverse_transform returns a list of tuples, so we take the first element (index 0) pred_tags = mlb.inverse_transform(pred_tags_binary)[0] st.success("✅ Classification Complete!") st.subheader("Prediction Results") st.metric("Predicted Type", pred_type) st.metric("Predicted Queue", pred_queue) if pred_tags: st.markdown(f"**Predicted Tags:** {', '.join(pred_tags)}") else: st.markdown("**Predicted Tags:** No significant tags found.")