Update app.py
app.py CHANGED

@@ -8,135 +8,181 @@ import streamlit as st
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
-from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
+# Note: the Tokenizer import is not strictly needed here, since the fitted
+# tokenizer is loaded from file; it is kept commented out for reference.
+# from tensorflow.keras.preprocessing.text import Tokenizer

-# -----------------------------
-load_model = tf.keras.models.load_model # IMPORTANT
+## -----------------------------
+## 📦 Setup and Configuration
+## -----------------------------
+
+# Use TensorFlow's legacy loader for compatibility
+load_model = tf.keras.models.load_model  # IMPORTANT for older Streamlit/TensorFlow versions

+# English stop words list
+stop_english = set(stopwords.words('english'))
+
-# NLTK Requirements
-# Custom NLTK directory
+# Custom NLTK directory setup
NLTK_DIR = os.path.join(os.getcwd(), "nltk_data")
os.makedirs(NLTK_DIR, exist_ok=True)
nltk.data.path.append(NLTK_DIR)

-# Download
+# Download NLTK resources if missing
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt', download_dir=NLTK_DIR)

-# Download stopwords if missing
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords', download_dir=NLTK_DIR)

+# Configuration must be set before Streamlit components are defined
+st.set_page_config(
+    page_title="Ticket Classification App",
+    layout="centered",
+    initial_sidebar_state="auto"
+)
+
+## -----------------------------
+## ⚙️ Load Model, Tokenizer, and Encoders
+## -----------------------------

-mlb = pickle.load(f)
+@st.cache_resource
+def load_assets():
+    """Loads all necessary machine learning assets."""
+    try:
+        # Load Model
+        model_path = "model.h5"
+        model = load_model(model_path, compile=False)
+
+        # Load Encoders
+        with open("le_type.pkl", "rb") as f:
+            le_type = pickle.load(f)
+        with open("le_queue.pkl", "rb") as f:
+            le_queue = pickle.load(f)
+        with open("mlb.pkl", "rb") as f:
+            mlb = pickle.load(f)
+
+        # Load Tokenizer
+        with open("tokenizer.pkl", "rb") as f:
+            tokenizer = pickle.load(f)
+
+        return model, le_type, le_queue, mlb, tokenizer
+
+    except FileNotFoundError as e:
+        st.error(f"Missing required file: {e.filename}. Please ensure 'model.h5', 'tokenizer.pkl', 'le_type.pkl', 'le_queue.pkl', and 'mlb.pkl' are in the same directory.")
+        st.stop()
+    except Exception as e:
+        st.error(f"An error occurred during asset loading: {e}")
+        st.stop()

+model, le_type, le_queue, mlb, tokenizer = load_assets()
+MAX_SEQ_LEN = 107  # MUST match the sequence length used at training time

-# -----------------------------
-# Clean Text
-# -----------------------------
+## -----------------------------
+## 🧼 Text Cleaning and Preparation Functions
+## -----------------------------

def clean_text(t):
+    """Performs text cleaning including lowercasing, stop word removal, and regex cleaning."""
+    if pd.isna(t) or t is None:
        return ""
-    t = t.lower()
+
+    t = str(t).lower()
+
+    # Tokenization and stop word removal
    tokens = word_tokenize(t)
    tokens = [w for w in tokens if w not in stop_english and len(w) > 2]
    t = " ".join(tokens)
+
-    t = re.sub(r"<.*?>", " ", t)
-    t = re.sub(r"\\n", " ", t)
-    t = re.sub(r"http\S+|www\.\S+", " ", t)
-    t = re.sub(r"\S+@\S+", " ", t)
+    # Regex cleaning
+    t = re.sub(r"<.*?>", " ", t)  # Remove HTML tags
+    t = re.sub(r"\\n", " ", t)  # Remove literal "\n" sequences
+    t = re.sub(r"http\S+|www\.\S+", " ", t)  # Remove URLs
+    t = re.sub(r"\S+@\S+", " ", t)  # Remove emails
+    # Remove various punctuation and special characters
+    t = re.sub(r"[%\[\]_\\<\(\]#\?\'\":\)\-\;\+\!\/,>\.\n\r]", " ", t)
+    t = re.sub(r"\s+", " ", t).strip()  # Collapse multiple spaces and trim
+
    return t

-# -----------------------------
-# Convert Text → Sequence
-# -----------------------------
def convert_to_sequence(txt):
+    """Converts cleaned text to a padded sequence."""
+    seq = tokenizer.texts_to_sequences([txt])  # the tokenizer expects a list
    padded = pad_sequences(seq, maxlen=MAX_SEQ_LEN, padding="pre", truncating="pre")
    return padded

+## -----------------------------
+## 🖥️ Streamlit UI
+## -----------------------------

+st.title("🎫 Ticket Classification App")
+st.markdown("Enter the subject and body of a support ticket to predict its **Type**, **Queue**, and **Tags**.")

+# Example text display (for context/help)
+st.subheader("Example Input")
+st.code("""Subject: Account Disruption
+Body: Dear Customer Support Team, I am writing to report a significant problem with the centralized account management portal...""")
+st.markdown("---")

+col1, col2 = st.columns(2)
+
+with col1:
+    subject = st.text_input("Enter the **Subject**:", key="subject_input")
+with col2:
+    body = st.text_area("Enter the **Body**:", key="body_input", height=100)

+## -----------------------------
+## 🚀 Prediction Logic
+## -----------------------------

-cleaned = clean_text(raw_text)
-st.write("Cleaned Text:", cleaned)
-seq = convert_to_sequence(cleaned)
-pred_type_probs, pred_queue_probs, pred_tags_probs = preds
+if st.button("Submit for Classification"):
+
+    if not subject and not body:
+        st.warning("Please enter some text in the Subject or Body fields to submit.")
+    else:
+        with st.spinner('Classifying ticket...'):
+            # Combine and clean text
+            raw_text = subject + " " + body
+            cleaned = clean_text(raw_text)
+
+            if not cleaned:
+                st.warning("The input text was empty or contained only stop words/punctuation after cleaning.")
+            else:
+                # Convert to sequence
+                seq = convert_to_sequence(cleaned)
+
+                # Make prediction
+                preds = model.predict(seq, verbose=0)
+                pred_type_probs, pred_queue_probs, pred_tags_probs = preds
+
+                # Decode single-label outputs
+                pred_type = le_type.inverse_transform([np.argmax(pred_type_probs)])
+                pred_queue = le_queue.inverse_transform([np.argmax(pred_queue_probs)])
+
+                # Decode multi-label outputs
+                pred_tags_binary = (pred_tags_probs >= 0.5).astype(int)
+                pred_tags = mlb.inverse_transform(pred_tags_binary)
+
+                # --- Display Results ---
+                st.success("Classification Complete!")
+
+                st.markdown("### Predicted Categories")
+                st.write(f"**Type:** `{pred_type[0]}`")
+                st.write(f"**Queue:** `{pred_queue[0]}`")
+
+                if pred_tags and pred_tags[0]:
+                    st.write(f"**Tags:** `{'`, `'.join(pred_tags[0])}`")
+                else:
+                    st.write("**Tags:** *No tags predicted (below threshold)*")
+
+                st.markdown("---")
+                st.markdown("### Preprocessing Details")
+                st.write("**Cleaned Text:**", cleaned)
+                # Optional: show probability scores for debugging
+                # st.write("Type Confidence:", np.max(pred_type_probs))
+                # st.write("Queue Confidence:", np.max(pred_queue_probs))
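
A few notes on what the updated code assumes. The unpacking `pred_type_probs, pred_queue_probs, pred_tags_probs = preds` only works if `model.h5` is a Keras model with exactly three outputs: two softmax heads (type, queue) and one sigmoid head (tags). The model definition is not part of this diff, so the sketch below is only an assumed shape; every dimension in it is a placeholder, not taken from the actual model.

```python
# Hypothetical three-output head matching the unpacking in app.py.
# All dimensions here are placeholders.
from tensorflow.keras import layers, Model

inp = layers.Input(shape=(107,))                                     # MAX_SEQ_LEN
x = layers.Embedding(input_dim=20000, output_dim=64)(inp)
x = layers.GlobalAveragePooling1D()(x)
out_type = layers.Dense(4, activation="softmax", name="type")(x)     # single-label
out_queue = layers.Dense(10, activation="softmax", name="queue")(x)  # single-label
out_tags = layers.Dense(25, activation="sigmoid", name="tags")(x)    # multi-label
model = Model(inp, [out_type, out_queue, out_tags])
# model.predict(...) on such a model returns a list of three arrays,
# which is what the tuple unpacking in the app relies on.
```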
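
The tag-decoding step depends on `MultiLabelBinarizer.inverse_transform` returning one tuple per sample, which is why the app checks `pred_tags[0]` before joining. A toy illustration with made-up tag names:

```python
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
mlb.fit([["billing", "login", "refund"]])   # toy tag vocabulary; classes are sorted

probs = np.array([[0.81, 0.12, 0.55]])      # sigmoid scores for one ticket
binary = (probs >= 0.5).astype(int)         # -> [[1, 0, 1]]
print(mlb.inverse_transform(binary))        # -> [('billing', 'refund')]

none = (np.array([[0.2, 0.1, 0.3]]) >= 0.5).astype(int)
print(mlb.inverse_transform(none))          # -> [()]  -> the "No tags predicted" branch
```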
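
A quick sanity check on `clean_text`: stop-word removal runs before the regex pass, so negations like "not" are dropped along with other stop words, and single-character punctuation tokens fail the `len(w) > 2` test. Assuming the NLTK resources above are installed (exact output may vary slightly across NLTK versions):

```python
print(clean_text("The payment page is NOT loading!!!"))
# -> "payment page loading"
# "the"/"is"/"not" are NLTK stop words, and the "!" tokens fail len(w) > 2.
```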
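
Finally, the five artifacts the app loads (`model.h5`, `tokenizer.pkl`, `le_type.pkl`, `le_queue.pkl`, `mlb.pkl`) must all come from the same training run that fixed `MAX_SEQ_LEN = 107`. A minimal export sketch under that assumption; the data and fitting steps here are illustrative only, not the actual training pipeline:

```python
import pickle
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from tensorflow.keras.preprocessing.text import Tokenizer

# Toy stand-ins for the real training columns
texts  = ["billing issue with invoice", "cannot log in to portal"]
types  = ["Incident", "Request"]
queues = ["Billing", "IT Support"]
tags   = [["billing", "invoice"], ["login"]]

tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(texts)

le_type  = LabelEncoder().fit(types)
le_queue = LabelEncoder().fit(queues)
mlb      = MultiLabelBinarizer().fit(tags)

for name, obj in [("tokenizer.pkl", tokenizer), ("le_type.pkl", le_type),
                  ("le_queue.pkl", le_queue), ("mlb.pkl", mlb)]:
    with open(name, "wb") as f:
        pickle.dump(obj, f)

# The trained model itself would be saved with model.save("model.h5"); the
# padding length used during training (107 here) must be reused in app.py.
```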