Update app.py
Browse files

app.py (CHANGED)

@@ -8,136 +8,162 @@ import streamlit as st
  8     import tensorflow as tf
  9     from nltk.corpus import stopwords
 10     from nltk.tokenize import word_tokenize
 11 -   from tensorflow.keras.preprocessing.text import Tokenizer
 12     from tensorflow.keras.preprocessing.sequence import pad_sequences
 13
 14 -
 15 -   # -----------------------------
 16 -   # Use TensorFlow's legacy loader
 17 -   # -----------------------------
 18 -   load_model = tf.keras.models.load_model  # IMPORTANT
 19 -
 20 -
 21 -   # -----------------------------
 22 -   # NLTK Requirements
 23 -   # -----------------------------
 24 -   # Custom NLTK directory
 25 -   NLTK_DIR = os.path.join(os.getcwd(), "nltk_data")
 26 -   os.makedirs(NLTK_DIR, exist_ok=True)
 27 -   nltk.data.path.append(NLTK_DIR)
 28 -
 29 -   try:
 30 -       nltk.data.find('tokenizers/punkt')
 31 -   except LookupError:
 32 -       nltk.download('punkt', download_dir=NLTK_DIR)
 33 -
 34 -   try:
 35 -       nltk.data.find('corpora/stopwords')
 36 -   except LookupError:
 37 -       nltk.download('stopwords', download_dir=NLTK_DIR)
 38 -
 39 -   # Load stopwords NOW
 40 -   stop_english = set(stopwords.words("english"))
 41 -
 42 -   # -----------------------------
 43 -   # Example text
 44 -   # -----------------------------
 45 -   st.write("Account Disruption")
 46 -   st.write("""Dear Customer Support Team,
 47 -   I am writing to report a significant problem with the centralized account management portal...
 48 -   """)
 49 -
 50 -   # -----------------------------
 51 -   # Streamlit UI
 52 -   # -----------------------------
 53 -   st.title("Ticket Classification App")
 54 -
 55 -   col1, col2 = st.columns(2)
 56 -   with col1:
 57 -       subject = st.text_input("Enter your subject:")
 58 -   with col2:
 59 -       body = st.text_input("Enter your body:")
 60 -
 61 -   # -----------------------------
 62 -   # Load Model
 63 -   # -----------------------------
 64 -   model_path = "model.h5"
 65 -   model = load_model(model_path, compile=False)  # <- works on HF
 66 -
 67 -   with open("le_type.pkl", "rb") as f:
 68 -       le_type = pickle.load(f)
 69 -
 70 -   with open("le_queue.pkl", "rb") as f:
 71 -       le_queue = pickle.load(f)
 72 -
 73 -   with open("mlb.pkl", "rb") as f:
 74 -       mlb = pickle.load(f)
 75 -
 76 -   # -----------------------------
 77 -   # Load Tokenizer
 78 -   # -----------------------------
 79 -   with open("tokenizer.pkl", "rb") as f:
 80 -       tokenizer = pickle.load(f)
 81 -
 82 -   MAX_SEQ_LEN = 107  # MUST match training
 83 -
 84 -
 85 -   # -----------------------------
 86 -   # Clean Text
 87 -   # -----------------------------
 88     def clean_text(t):
 89 -
 90             return ""
 91 -
 92         t = t.lower()
 93         tokens = word_tokenize(t)
 94 -       tokens = [w for w in tokens if w not in stop_english and len(w) > 2]
 95         t = " ".join(tokens)
 96 -
 97 -       #
 98 -
 99 -       t = re.sub(r"\\n", " ", t)
100 -
101 -       t = re.sub(r"\
102 -       t = re.sub(r"
103 -
104 -
105         return t
106
107 -
108 -   # -----------------------------
109 -   # Convert Text → Sequence
110 -   # -----------------------------
111     def convert_to_sequence(txt):
112 -
113 -
114         return padded
115
116
117
118 -   #
119 -
120 -
121 -
122 -
123 -
124 -       cleaned = clean_text(raw_text)
125 -       st.write("Cleaned Text:", cleaned)
126 -
127 -       seq = convert_to_sequence(cleaned)
128 -
129 -       preds = model.predict(seq)
130 -
131 -       pred_type_probs, pred_queue_probs, pred_tags_probs = preds
132
133 -
134 -
135 -
136
137 -
138 -       pred_tags_binary = (pred_tags_probs >= 0.5).astype(int)
139 -       pred_tags = mlb.inverse_transform(pred_tags_binary)
140
141 -
142 -
143 -
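The body of convert_to_sequence in the removed version above did not survive extraction. For orientation only: the standard Keras pattern it follows, which the new version below spells out, looks like this minimal sketch. The fitted texts, vocabulary size, and maxlen=10 are illustrative stand-ins, not the app's real values.

# Minimal sketch of the text-to-padded-sequence step (illustrative values only).
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=1000)  # hypothetical vocabulary size
tokenizer.fit_on_texts(["account portal problem", "billing issue"])

seq = tokenizer.texts_to_sequences(["account problem"])  # takes a list of texts
padded = pad_sequences(seq, maxlen=10, padding="pre", truncating="pre")
print(padded.shape)  # (1, 10): one sample, padded to a fixed length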
  8     import tensorflow as tf
  9     from nltk.corpus import stopwords
 10     from nltk.tokenize import word_tokenize
 11     from tensorflow.keras.preprocessing.sequence import pad_sequences
 12 +   # Note: tokenizer from Keras is not strictly needed for loading,
 13 +   # but included for completeness if needed for re-training later.
 14 +
 15 +   # --- IMPORTANT: TensorFlow Legacy Loader (Ensures compatibility) ---
 16 +   # Use TensorFlow's legacy loader for models
 17 +   load_model = tf.keras.models.load_model
 18 +
 19 +   # --- NLTK Configuration for Hugging Face Spaces ---
 20 +   # HF Spaces use persistent storage, but downloading NLTK data on
 21 +   # startup is safer for fresh environment builds.
 22 +   @st.cache_resource
 23 +   def setup_nltk():
 24 +       """Sets up NLTK data and returns English stopwords."""
 25 +       # Define a temporary directory for NLTK if needed,
 26 +       # but in HF spaces, it usually works by default or needs a specific path.
 27 +       # We will let nltk handle the path for simplicity.
 28 +       try:
 29 +           nltk.data.find('tokenizers/punkt')
 30 +       except LookupError:
 31 +           nltk.download('punkt')
 32 +
 33 +       try:
 34 +           nltk.data.find('corpora/stopwords')
 35 +       except LookupError:
 36 +           nltk.download('stopwords')
 37 +
 38 +       return set(stopwords.words("english"))
 39 +
 40 +   stop_english = setup_nltk()
 41 +
 42 +   # --- File Paths and Loading (CRITICAL for HF Spaces) ---
 43 +   # Ensure these files are uploaded to your Hugging Face repository
 44 +   # alongside this 'app.py' file.
 45 +   MODEL_PATH = "model.h5"
 46 +   LE_TYPE_PATH = "le_type.pkl"
 47 +   LE_QUEUE_PATH = "le_queue.pkl"
 48 +   MLB_PATH = "mlb.pkl"
 49 +   TOKENIZER_PATH = "tokenizer.pkl"
 50 +   MAX_SEQ_LEN = 107  # MUST match training
 51 +
 52 +   @st.cache_resource
 53 +   def load_resources():
 54 +       """Loads all model artifacts, including the model and preprocessors."""
 55 +       try:
 56 +           # Load Model
 57 +           # compile=False is necessary if custom objects were not compiled in
 58 +           model = load_model(MODEL_PATH, compile=False)
 59 +
 60 +           # Load Pickles
 61 +           with open(LE_TYPE_PATH, "rb") as f:
 62 +               le_type = pickle.load(f)
 63 +           with open(LE_QUEUE_PATH, "rb") as f:
 64 +               le_queue = pickle.load(f)
 65 +           with open(MLB_PATH, "rb") as f:
 66 +               mlb = pickle.load(f)
 67 +           with open(TOKENIZER_PATH, "rb") as f:
 68 +               tokenizer = pickle.load(f)
 69 +
 70 +           return model, le_type, le_queue, mlb, tokenizer
 71 +
 72 +       except FileNotFoundError as e:
 73 +           st.error(f"Required file not found: {e}. Please ensure all artifacts (model.h5, *.pkl) are uploaded.")
 74 +           st.stop()
 75 +       except Exception as e:
 76 +           st.error(f"An error occurred while loading resources: {e}")
 77 +           st.stop()
 78 +
 79 +   model, le_type, le_queue, mlb, tokenizer = load_resources()
 80 +
 81 +   # --- Text Preprocessing Functions ---
 82
 83     def clean_text(t):
 84 +       """Performs text cleaning for a given string."""
 85 +       if pd.isna(t) or t is None:
 86             return ""
 87 +
 88         t = t.lower()
 89 +       # Tokenize and remove stopwords/short words
 90         tokens = word_tokenize(t)
 91 +       tokens = [w for w in tokens if w not in stop_english and len(w) > 2 and w.isalnum()]
 92         t = " ".join(tokens)
 93 +
 94 +       # Regex cleaning (simplified and adjusted)
 95 +       # Removing common non-alphanumeric noise, URLs, and emails.
 96 +       t = re.sub(r"http\S+|www\.\S+|@\S+|\\n", " ", t)  # URLs, emails, newlines
 97 +       # Removing most punctuation but keeping spaces
 98 +       t = re.sub(r"[^a-zA-Z0-9\s]", " ", t)
 99 +       t = re.sub(r"\s+", " ", t).strip()  # Consolidate spaces
100 +
101         return t
102
103     def convert_to_sequence(txt):
104 +       """Converts cleaned text to a padded sequence."""
105 +       seq = tokenizer.texts_to_sequences([txt])  # Input must be a list
106 +       padded = pad_sequences(
107 +           seq, maxlen=MAX_SEQ_LEN, padding="pre", truncating="pre"
108 +       )
109         return padded
110
111 +   # --- Streamlit UI ---
112
113 +   st.set_page_config(page_title="Ticket Classification")
114 +   st.title("🎫 Ticket Classification App")
115
116 +   # Example Text Display
117 +   st.header("Example Input")
118 +   st.markdown("**Subject:** Account Disruption")
119 +   st.code("""Dear Customer Support Team,
120 +   I am writing to report a significant problem with the centralized account management portal...""")
121 +   st.write("---")
122
123 +   # Input Fields
124 +   col1, col2 = st.columns(2)
125 +   with col1:
126 +       subject = st.text_input("Enter your **Subject**:", key="subject_input")
127 +   with col2:
128 +       body = st.text_area("Enter your **Body**:", key="body_input", height=100)
129
130 +   # --- Prediction Logic ---
131
132 +   if st.button("Submit"):
133 +       if not subject and not body:
134 +           st.warning("Please enter a subject or body text to classify.")
135 +       else:
136 +           # Combine and Clean
137 +           raw_text = subject + " " + body
138 +           cleaned = clean_text(raw_text)
139 +
140 +           st.subheader("Preprocessing Results")
141 +           st.info(f"**Cleaned Text:** {cleaned}")
142 +
143 +           # Convert and Predict
144 +           seq = convert_to_sequence(cleaned)
145 +
146 +           with st.spinner("Classifying ticket..."):
147 +               preds = model.predict(seq, verbose=0)
148 +
149 +           pred_type_probs, pred_queue_probs, pred_tags_probs = preds
150 +
151 +           # 1. Decode single-label outputs
152 +           pred_type = le_type.inverse_transform([np.argmax(pred_type_probs)])[0]
153 +           pred_queue = le_queue.inverse_transform([np.argmax(pred_queue_probs)])[0]
154 +
155 +           # 2. Decode multi-label outputs (Tags)
156 +           pred_tags_binary = (pred_tags_probs >= 0.5).astype(int)
157 +           # mlb.inverse_transform returns a list of tuples, so we take the first element (index 0)
158 +           pred_tags = mlb.inverse_transform(pred_tags_binary)[0]
159 +
160 +           st.success("✅ Classification Complete!")
161 +
162 +           st.subheader("Prediction Results")
163 +           st.metric("Predicted Type", pred_type)
164 +           st.metric("Predicted Queue", pred_queue)
165 +
166 +           if pred_tags:
167 +               st.markdown(f"**Predicted Tags:** {', '.join(pred_tags)}")
168 +           else:
169 +               st.markdown("**Predicted Tags:** No significant tags found.")