File size: 6,008 Bytes
9a311f2
ca87583
9a311f2
 
 
 
 
1870950
9a311f2
 
1ea8824
2ae2548
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bfdba32
 
 
 
 
 
2ae2548
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1ea8824
9a311f2
2ae2548
 
9a311f2
2ae2548
bcc174f
2ae2548
9a311f2
2ae2548
9a311f2
2ae2548
 
 
 
 
 
 
 
9a311f2
 
 
2ae2548
 
 
 
 
9a311f2
 
2ae2548
9a311f2
2ae2548
 
9a311f2
2ae2548
 
 
 
 
 
666b057
2ae2548
3955d37
ec7e2a5
3955d37
 
 
 
 
bcc174f
2ae2548
bcc174f
2ae2548
 
 
 
 
ec7e2a5
2ae2548
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3955d37
2ae2548
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
import re
import os
import nltk
import pickle
import numpy as np
import pandas as pd
import streamlit as st
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Note: tokenizer from Keras is not strictly needed for loading, 
# but included for completeness if needed for re-training later.

# --- IMPORTANT: TensorFlow Legacy Loader (Ensures compatibility) ---
# Use TensorFlow's legacy loader for models
# Module-level alias so the rest of the file can call load_model() directly;
# the .h5 artifact below was saved in the legacy HDF5 format.
load_model = tf.keras.models.load_model

# --- NLTK Configuration for Hugging Face Spaces ---
# HF Spaces use persistent storage, but downloading NLTK data on
# startup is safer for fresh environment builds.
@st.cache_resource
def setup_nltk():
    """Download required NLTK data (first run only) and return English stopwords.

    Cached with st.cache_resource so the downloads happen once per
    Streamlit process, which is safe for fresh HF Spaces builds.

    Returns:
        set[str]: the English stopword set from the NLTK corpus.
    """
    # word_tokenize needs 'punkt'; NLTK >= 3.8.2 additionally requires
    # 'punkt_tab', so probe for both resources and fetch whatever is missing.
    for resource_path, package in (
        ("tokenizers/punkt", "punkt"),
        ("tokenizers/punkt_tab", "punkt_tab"),
        ("corpora/stopwords", "stopwords"),
    ):
        try:
            nltk.data.find(resource_path)
        except LookupError:
            nltk.download(package)

    return set(stopwords.words("english"))

stop_english = setup_nltk()

# --- File Paths and Loading (CRITICAL for HF Spaces) ---
# Ensure these files are uploaded to your Hugging Face repository 
# alongside this 'app.py' file.
MODEL_PATH = "model_ticket1.h5"        # Keras multi-output classifier (HDF5)
LE_TYPE_PATH = "le_type_ticket.pkl"    # label encoder for the "type" head
LE_QUEUE_PATH = "le_queue_ticket.pkl"  # label encoder for the "queue" head
MLB_PATH = "mlb_ticket.pkl"            # multi-label binarizer for the "tags" head
TOKENIZER_PATH = "tokenizer_ticket.pkl"  # fitted Keras text tokenizer
MAX_SEQ_LEN = 200  # MUST match training

@st.cache_resource
def load_resources():
    """Load the Keras model plus every pickled preprocessor, cached per process.

    Returns:
        tuple: (model, le_type, le_queue, mlb, tokenizer).

    On any failure the error is surfaced in the UI and the script is halted.
    """
    try:
        # compile=False: inference only, so the original optimizer/loss
        # objects are not needed to deserialize the model.
        model = load_model(MODEL_PATH, compile=False)

        def _unpickle(path):
            # Read a single pickled artifact from disk.
            with open(path, "rb") as fh:
                return pickle.load(fh)

        le_type = _unpickle(LE_TYPE_PATH)
        le_queue = _unpickle(LE_QUEUE_PATH)
        mlb = _unpickle(MLB_PATH)
        tokenizer = _unpickle(TOKENIZER_PATH)

        return model, le_type, le_queue, mlb, tokenizer

    except FileNotFoundError as e:
        st.error(f"Required file not found: {e}. Please ensure all artifacts (model.h5, *.pkl) are uploaded.")
        st.stop()
    except Exception as e:
        st.error(f"An error occurred while loading resources: {e}")
        st.stop()

model, le_type, le_queue, mlb, tokenizer = load_resources()

# --- Text Preprocessing Functions ---

def clean_text(t):
    """Normalize raw ticket text for the model.

    Lowercases, strips URLs/emails/punctuation, then tokenizes and drops
    English stopwords and tokens shorter than three characters.

    Args:
        t: Raw ticket text; may be None or NaN.

    Returns:
        str: Space-joined cleaned tokens ("" for missing input).
    """
    if pd.isna(t) or t is None:
        return ""

    t = t.lower()

    # Strip noise BEFORE tokenizing: word_tokenize splits URLs and emails
    # into fragments, so these patterns only match reliably on the raw
    # string. (The original ran them after tokenization, where they were
    # largely dead code.)  \\n targets literal backslash-n escape residue.
    t = re.sub(r"http\S+|www\.\S+|@\S+|\\n", " ", t)  # URLs, emails, newlines
    # Remove remaining punctuation, keeping alphanumerics and spaces.
    t = re.sub(r"[^a-zA-Z0-9\s]", " ", t)
    t = re.sub(r"\s+", " ", t).strip()  # Consolidate spaces

    # Tokenize, then drop stopwords and short/non-alphanumeric tokens.
    tokens = word_tokenize(t)
    tokens = [w for w in tokens if w not in stop_english and len(w) > 2 and w.isalnum()]

    return " ".join(tokens)

def convert_to_sequence(txt):
    """Encode cleaned text as integer IDs and pad to the model's input length.

    Args:
        txt: A single cleaned text string.

    Returns:
        A padded integer sequence array of shape (1, MAX_SEQ_LEN).
    """
    # texts_to_sequences expects an iterable of documents, hence the list wrap.
    encoded = tokenizer.texts_to_sequences([txt])
    # Pre-padding/truncating must mirror the training pipeline exactly.
    return pad_sequences(encoded, maxlen=MAX_SEQ_LEN, padding="pre", truncating="pre")

# --- Streamlit UI ---

st.set_page_config(page_title="Ticket Classification")
st.title("🎫 Ticket Classification App")

# Example Text Display
st.header("Example Input")
st.markdown("**Subject:** Account Disruption")
st.code("""Dear Customer Support Team,
I am writing to report a significant problem with the centralized account management portal...""")
st.write("---")

# Input Fields: a single text area now collects subject + body together;
# `subject` is kept as a placeholder so the combined-text logic still works.
body = st.text_area("Enter your **Subject** and **Body**:", key="subject_body_input", height=200)
subject = " "

# --- Prediction Logic ---

if st.button("Submit"):
    # BUG FIX: `subject` is hard-coded to " " (truthy), so the original
    # `if not subject and not body` check could never fire and an empty
    # submission was silently classified. Test for meaningful content.
    if not (subject.strip() or body.strip()):
        st.warning("Please enter a subject or body text to classify.")
    else:
        # Combine and Clean
        raw_text =  body + " " + subject 
        cleaned = clean_text(raw_text)
        
        st.subheader("Preprocessing Results")
        st.info(f"**Cleaned Text:** {cleaned}")
        
        # Convert and Predict
        seq = convert_to_sequence(cleaned)
        
        with st.spinner("Classifying ticket..."):
            preds = model.predict(seq, verbose=0)
            
        # Model has three output heads: type, queue, tags.
        pred_type_probs, pred_queue_probs, pred_tags_probs = preds
        
        # 1. Decode single-label outputs via argmax + inverse label encoding.
        pred_type = le_type.inverse_transform([np.argmax(pred_type_probs)])[0]
        pred_queue = le_queue.inverse_transform([np.argmax(pred_queue_probs)])[0]
        
        # 2. Decode multi-label outputs (Tags) with a 0.5 probability threshold.
        pred_tags_binary = (pred_tags_probs >= 0.5).astype(int)
        # mlb.inverse_transform returns a list of tuples, so we take the first element (index 0)
        pred_tags = mlb.inverse_transform(pred_tags_binary)[0]
        
        st.success("✅ Classification Complete!")
        
        st.metric("Predicted Type", pred_type)
        st.metric("Predicted Queue", pred_queue)
        
        if pred_tags:
            st.markdown(f"**Predicted Tags:** {', '.join(pred_tags)}")
        else:
            st.markdown("**Predicted Tags:** No significant tags found.")