Spaces:

koushikvkr484
/

Multilingual_Hierarchical_Ticket_Classification

Sleeping

App Files Files Community

koushikvkr484 committed on Nov 20, 2025

Commit

bcc174f

verified ·

1 Parent(s): 51b92e6

Update app.py

Browse files

Files changed (1) hide show

app.py +92 -137

app.py CHANGED Viewed

@@ -8,27 +8,24 @@ import streamlit as st
 import tensorflow as tf
 from nltk.corpus import stopwords
 from nltk.tokenize import word_tokenize
 from tensorflow.keras.preprocessing.sequence import pad_sequences
-# Note: Tokenizer import is not strictly needed here since it's loaded from file,
-# but it was in your original code, so it is kept for completeness.
-# from tensorflow.keras.preprocessing.text import Tokenizer
-## -----------------------------
-## 📦 Setup and Configuration
-## -----------------------------
-# Use TensorFlow's legacy loader for compatibility
-load_model = tf.keras.models.load_model   # IMPORTANT for older Streamlit/TensorFlow versions
-# English stop words list
-stop_english = set(stopwords.words('english'))
-# Custom NLTK directory setup
 NLTK_DIR = os.path.join(os.getcwd(), "nltk_data")
 os.makedirs(NLTK_DIR, exist_ok=True)
 nltk.data.path.append(NLTK_DIR)
-# Download NLTK resources if missing
 try:
     nltk.data.find('tokenizers/punkt')
 except LookupError:
@@ -39,150 +36,108 @@ try:
 except LookupError:
     nltk.download('stopwords', download_dir=NLTK_DIR)
-# Configuration must be set before Streamlit components are defined
-st.set_page_config(
-    page_title="Ticket Classification App",
-    layout="centered",
-    initial_sidebar_state="auto"
-)
-## -----------------------------
-## ⚙️ Load Model, Tokenizer, and Encoders
-## -----------------------------
-@st.cache_resource
-def load_assets():
-    """Loads all necessary machine learning assets."""
-    try:
-        # Load Model
-        model_path = "model.h5"
-        model = load_model(model_path, compile=False)
-        # Load Encoders
-        with open("le_type.pkl", "rb") as f:
-            le_type = pickle.load(f)
-        with open("le_queue.pkl", "rb") as f:
-            le_queue = pickle.load(f)
-        with open("mlb.pkl", "rb") as f:
-            mlb = pickle.load(f)
-        # Load Tokenizer
-        with open("tokenizer.pkl", "rb") as f:
-            tokenizer = pickle.load(f)
-        return model, le_type, le_queue, mlb, tokenizer
-    except FileNotFoundError as e:
-        st.error(f"Missing required file: {e.filename}. Please ensure 'model.h5', 'tokenizer.pkl', 'le_type.pkl', 'le_queue.pkl', and 'mlb.pkl' are in the same directory.")
-        st.stop()
-    except Exception as e:
-        st.error(f"An error occurred during asset loading: {e}")
-        st.stop()
-model, le_type, le_queue, mlb, tokenizer = load_assets()
-MAX_SEQ_LEN = 107   # MUST match training parameter
-## -----------------------------
-## 🧼 Text Cleaning and Preparation Functions
-## -----------------------------
 def clean_text(t):
-    """Performs text cleaning including lowercasing, stop word removal, and regex cleaning."""
-    if pd.isna(t) or t is None:
         return ""
-    t = str(t).lower()
-    # Tokenization and Stop Word Removal
     tokens = word_tokenize(t)
     tokens = [w for w in tokens if w not in stop_english and len(w) > 2]
     t = " ".join(tokens)
-    # Regex cleaning
-    t = re.sub(r"<.*?>", " ", t)                    # Remove HTML tags
-    t = re.sub(r"\\n", " ", t)                      # Remove literal \n
-    t = re.sub(r"http\S+|www\.\S+", " ", t)         # Remove URLs
-    t = re.sub(r"\S+@\S+", " ", t)                  # Remove emails
-    # Remove various punctuation and special characters
-    t = re.sub(r"[%\[\]_\\<\(\]#\?\'\":\)\-\;\+\!\/,>\.\n\r]", " ", t)
-    t = re.sub(r"\s+", " ", t).strip()              # Collapse multiple spaces and trim
     return t
 def convert_to_sequence(txt):
-    """Converts cleaned text to a padded sequence."""
-    seq = tokenizer.texts_to_sequences([txt])  # Tokenizer expects a list
     padded = pad_sequences(seq, maxlen=MAX_SEQ_LEN, padding="pre", truncating="pre")
     return padded
-## -----------------------------
-## 🖥️ Streamlit UI
-## -----------------------------
-st.title("🎫 Ticket Classification App")
-st.markdown("Enter the subject and body of a support ticket to predict its **Type**, **Queue**, and **Tags**.")
-# Example text display (for context/help)
-st.subheader("Example Input")
-st.code("""Subject: Account Disruption
-Body: Dear Customer Support Team, I am writing to report a significant problem with the centralized account management portal...""")
-st.markdown("---")
-col1, col2 = st.columns(2)
-with col1:
-    subject = st.text_input("Enter the **Subject**:", key="subject_input")
-with col2:
-    body = st.text_area("Enter the **Body**:", key="body_input", height=100)
-## -----------------------------
-## 🚀 Prediction Logic
-## -----------------------------
-if st.button("Submit for Classification"):
-    if not subject and not body:
-        st.warning("Please enter some text in the Subject or Body fields to submit.")
-    else:
-        with st.spinner('Classifying ticket...'):
-            # Combine and Clean Text
-            raw_text = subject + " " + body
-            cleaned = clean_text(raw_text)
-            if not cleaned:
-                st.warning("The input text was empty or contained only stop words/punctuation after cleaning.")
-            else:
-                # Convert to Sequence
-                seq = convert_to_sequence(cleaned)
-                # Make Prediction
-                preds = model.predict(seq, verbose=0)
-                pred_type_probs, pred_queue_probs, pred_tags_probs = preds
-                # Decode single-label outputs
-                pred_type = le_type.inverse_transform([np.argmax(pred_type_probs)])
-                pred_queue = le_queue.inverse_transform([np.argmax(pred_queue_probs)])
-                # Decode multi-label outputs
-                pred_tags_binary = (pred_tags_probs >= 0.5).astype(int)
-                pred_tags = mlb.inverse_transform(pred_tags_binary)
-                # --- Display Results ---
-                st.success("Classification Complete!")
-                st.markdown("### Predicted Categories")
-                st.write(f"**Type:** `{pred_type[0]}`")
-                st.write(f"**Queue:** `{pred_queue[0]}`")
-                if pred_tags and pred_tags[0]:
-                    st.write(f"**Tags:** `{'`, `'.join(pred_tags[0])}`")
-                else:
-                    st.write("**Tags:** *No tags predicted (below threshold)*")
-                st.markdown("---")
-                st.markdown("### Preprocessing Details")
-                st.write("**Cleaned Text:**", cleaned)
-                # Optional: Show a preview of the probability scores for debugging
-                # st.write("Type Confidence:", np.max(pred_type_probs))
-                # st.write("Queue Confidence:", np.max(pred_queue_probs))

 import tensorflow as tf
 from nltk.corpus import stopwords
 from nltk.tokenize import word_tokenize
+from tensorflow.keras.preprocessing.text import Tokenizer
 from tensorflow.keras.preprocessing.sequence import pad_sequences
+# -----------------------------
+# Use TensorFlow's legacy loader
+# -----------------------------
+load_model = tf.keras.models.load_model   # IMPORTANT
+# -----------------------------
+# NLTK Requirements
+# -----------------------------
+# Custom NLTK directory
 NLTK_DIR = os.path.join(os.getcwd(), "nltk_data")
 os.makedirs(NLTK_DIR, exist_ok=True)
 nltk.data.path.append(NLTK_DIR)
 try:
     nltk.data.find('tokenizers/punkt')
 except LookupError:
 except LookupError:
     nltk.download('stopwords', download_dir=NLTK_DIR)
+# Build the English stop-word set once, after the NLTK download check above; clean_text() filters tokens against it.
+stop_english = set(stopwords.words("english"))
+# -----------------------------
+# Example ticket (subject line, then body). NOTE(review): these calls precede st.title, so the example presumably renders above the page title - confirm that is intended.
+# -----------------------------
+st.write("Account Disruption")
+st.write("""Dear Customer Support Team,
+I am writing to report a significant problem with the centralized account management portal...
+""")
+# -----------------------------
+# Streamlit UI: page title, then two columns holding the subject and body text inputs.
+# -----------------------------
+st.title("Ticket Classification App")
+col1, col2 = st.columns(2)
+with col1:
+    subject = st.text_input("Enter your subject:")
+with col2:
+    body = st.text_input("Enter your body:")
+# -----------------------------
+# Load Model
+# -----------------------------
+model_path = "model.h5"
+model = load_model(model_path, compile=False)  # <- works on HF
+with open("le_type.pkl", "rb") as f:
+    le_type = pickle.load(f)
+with open("le_queue.pkl", "rb") as f:
+    le_queue = pickle.load(f)
+with open("mlb.pkl", "rb") as f:
+    mlb = pickle.load(f)
+# -----------------------------
+# Load Tokenizer
+# -----------------------------
+with open("tokenizer.pkl", "rb") as f:
+    tokenizer = pickle.load(f)
+MAX_SEQ_LEN = 107   # MUST match training
+# -----------------------------
+# Clean Text
+# -----------------------------
 def clean_text(t):
+    if pd.isna(t):
         return ""
+    t = t.lower()
     tokens = word_tokenize(t)
     tokens = [w for w in tokens if w not in stop_english and len(w) > 2]
     t = " ".join(tokens)
+    # regex cleaning
+    t = re.sub(r"<.*?>", " ", t)
+    t = re.sub(r"\\n", " ", t)
+    t = re.sub(r"http\S+|www\.\S+", " ", t)
+    t = re.sub(r"\S+@\S+", " ", t)
+    t = re.sub(r"[%\[\]_\\<\(\]#\?\'\":\)\-\;\+\!\/,>\.\n\r]", " ", t)
+    t = re.sub(r"\s+", " ", t).strip()
     return t
+# -----------------------------
+# Convert Text → Sequence
+# -----------------------------
 def convert_to_sequence(txt):
+    seq = tokenizer.texts_to_sequences([txt])  # must be list
     padded = pad_sequences(seq, maxlen=MAX_SEQ_LEN, padding="pre", truncating="pre")
     return padded
+# -----------------------------
+# Prediction
+# -----------------------------
+if st.button("Submit"):
+    raw_text = subject + " " + body
+    cleaned = clean_text(raw_text)
+    st.write("Cleaned Text:", cleaned)
+    seq = convert_to_sequence(cleaned)
+    preds = model.predict(seq)
+    pred_type_probs, pred_queue_probs, pred_tags_probs = preds
+    # Decode single-label outputs
+    pred_type = le_type.inverse_transform([np.argmax(pred_type_probs)])
+    pred_queue = le_queue.inverse_transform([np.argmax(pred_queue_probs)])
+    # Decode multi-label outputs
+    pred_tags_binary = (pred_tags_probs >= 0.5).astype(int)
+    pred_tags = mlb.inverse_transform(pred_tags_binary)
+    st.write("Predicted Type:", pred_type[0])
+    st.write("Predicted Queue:", pred_queue[0])
+    st.write("Predicted Tags:", pred_tags)