koushikvkr484 committed on
Commit
2ae2548
·
verified ·
1 Parent(s): c62f10c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +142 -116
app.py CHANGED
@@ -8,136 +8,162 @@ import streamlit as st
8
  import tensorflow as tf
9
  from nltk.corpus import stopwords
10
  from nltk.tokenize import word_tokenize
11
- from tensorflow.keras.preprocessing.text import Tokenizer
12
  from tensorflow.keras.preprocessing.sequence import pad_sequences
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
-
15
- # -----------------------------
16
- # Use TensorFlow's legacy loader
17
- # -----------------------------
18
- load_model = tf.keras.models.load_model # IMPORTANT
19
-
20
-
21
- # -----------------------------
22
- # NLTK Requirements
23
- # -----------------------------
24
- # Custom NLTK directory
25
- NLTK_DIR = os.path.join(os.getcwd(), "nltk_data")
26
- os.makedirs(NLTK_DIR, exist_ok=True)
27
- nltk.data.path.append(NLTK_DIR)
28
-
29
- try:
30
- nltk.data.find('tokenizers/punkt')
31
- except LookupError:
32
- nltk.download('punkt', download_dir=NLTK_DIR)
33
-
34
- try:
35
- nltk.data.find('corpora/stopwords')
36
- except LookupError:
37
- nltk.download('stopwords', download_dir=NLTK_DIR)
38
-
39
- # Load stopwords NOW
40
- stop_english = set(stopwords.words("english"))
41
-
42
- # -----------------------------
43
- # Example text
44
- # -----------------------------
45
- st.write("Account Disruption")
46
- st.write("""Dear Customer Support Team,
47
- I am writing to report a significant problem with the centralized account management portal...
48
- """)
49
-
50
- # -----------------------------
51
- # Streamlit UI
52
- # -----------------------------
53
- st.title("Ticket Classification App")
54
-
55
- col1, col2 = st.columns(2)
56
- with col1:
57
- subject = st.text_input("Enter your subject:")
58
- with col2:
59
- body = st.text_input("Enter your body:")
60
-
61
- # -----------------------------
62
- # Load Model
63
- # -----------------------------
64
- model_path = "model.h5"
65
- model = load_model(model_path, compile=False) # <- works on HF
66
-
67
- with open("le_type.pkl", "rb") as f:
68
- le_type = pickle.load(f)
69
-
70
- with open("le_queue.pkl", "rb") as f:
71
- le_queue = pickle.load(f)
72
-
73
- with open("mlb.pkl", "rb") as f:
74
- mlb = pickle.load(f)
75
-
76
- # -----------------------------
77
- # Load Tokenizer
78
- # -----------------------------
79
- with open("tokenizer.pkl", "rb") as f:
80
- tokenizer = pickle.load(f)
81
-
82
- MAX_SEQ_LEN = 107 # MUST match training
83
-
84
-
85
- # -----------------------------
86
- # Clean Text
87
- # -----------------------------
88
  def clean_text(t):
89
- if pd.isna(t):
 
90
  return ""
91
-
92
  t = t.lower()
 
93
  tokens = word_tokenize(t)
94
- tokens = [w for w in tokens if w not in stop_english and len(w) > 2]
95
  t = " ".join(tokens)
96
-
97
- # regex cleaning
98
- t = re.sub(r"<.*?>", " ", t)
99
- t = re.sub(r"\\n", " ", t)
100
- t = re.sub(r"http\S+|www\.\S+", " ", t)
101
- t = re.sub(r"\S+@\S+", " ", t)
102
- t = re.sub(r"[%\[\]_\\<\(\]#\?\'\":\)\-\;\+\!\/,>\.\n\r]", " ", t)
103
- t = re.sub(r"\s+", " ", t).strip()
104
-
105
  return t
106
 
107
-
108
- # -----------------------------
109
- # Convert Text → Sequence
110
- # -----------------------------
111
  def convert_to_sequence(txt):
112
- seq = tokenizer.texts_to_sequences([txt]) # must be list
113
- padded = pad_sequences(seq, maxlen=MAX_SEQ_LEN, padding="pre", truncating="pre")
 
 
 
114
  return padded
115
 
 
116
 
 
 
117
 
118
- # -----------------------------
119
- # Prediction
120
- # -----------------------------
121
- if st.button("Submit"):
122
- raw_text = subject + " " + body
123
-
124
- cleaned = clean_text(raw_text)
125
- st.write("Cleaned Text:", cleaned)
126
-
127
- seq = convert_to_sequence(cleaned)
128
-
129
- preds = model.predict(seq)
130
-
131
- pred_type_probs, pred_queue_probs, pred_tags_probs = preds
132
 
133
- # Decode single-label outputs
134
- pred_type = le_type.inverse_transform([np.argmax(pred_type_probs)])
135
- pred_queue = le_queue.inverse_transform([np.argmax(pred_queue_probs)])
 
 
 
136
 
137
- # Decode multi-label outputs
138
- pred_tags_binary = (pred_tags_probs >= 0.5).astype(int)
139
- pred_tags = mlb.inverse_transform(pred_tags_binary)
140
 
141
- st.write("Predicted Type:", pred_type[0])
142
- st.write("Predicted Queue:", pred_queue[0])
143
- st.write("Predicted Tags:", pred_tags)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  import tensorflow as tf
9
  from nltk.corpus import stopwords
10
  from nltk.tokenize import word_tokenize
 
11
  from tensorflow.keras.preprocessing.sequence import pad_sequences
12
+ # Note: tokenizer from Keras is not strictly needed for loading,
13
+ # but included for completeness if needed for re-training later.
14
+
15
+ # --- IMPORTANT: TensorFlow Legacy Loader (Ensures compatibility) ---
16
+ # Use TensorFlow's legacy loader for models
17
+ load_model = tf.keras.models.load_model
18
+
19
# --- NLTK Configuration for Hugging Face Spaces ---
# Downloading NLTK data at startup is safest for fresh environment builds.
@st.cache_resource
def setup_nltk():
    """Ensure the required NLTK corpora exist and return English stopwords.

    Cached with st.cache_resource so the find/download check runs only
    once per server process.
    """
    # punkt backs word_tokenize; stopwords backs stopwords.words().
    for probe, package in (
        ("tokenizers/punkt", "punkt"),
        ("corpora/stopwords", "stopwords"),
    ):
        try:
            nltk.data.find(probe)
        except LookupError:
            nltk.download(package)
    return set(stopwords.words("english"))

stop_english = setup_nltk()
41
+
42
# --- File Paths and Loading (CRITICAL for HF Spaces) ---
# All artifact files below must be uploaded to the Hugging Face repository
# alongside this 'app.py' file.
MODEL_PATH = "model.h5"
LE_TYPE_PATH = "le_type.pkl"
LE_QUEUE_PATH = "le_queue.pkl"
MLB_PATH = "mlb.pkl"
TOKENIZER_PATH = "tokenizer.pkl"
MAX_SEQ_LEN = 107  # MUST match training

@st.cache_resource
def load_resources():
    """Load the Keras model and every pickled preprocessor exactly once.

    Returns (model, le_type, le_queue, mlb, tokenizer). On any failure an
    error message is shown and the Streamlit script run is stopped.
    """
    def _unpickle(path):
        # Read one pickled artifact from disk.
        with open(path, "rb") as fh:
            return pickle.load(fh)

    try:
        # compile=False: inference only, so the training configuration
        # (losses/optimizer) does not need to resolve.
        model = load_model(MODEL_PATH, compile=False)

        le_type = _unpickle(LE_TYPE_PATH)
        le_queue = _unpickle(LE_QUEUE_PATH)
        mlb = _unpickle(MLB_PATH)
        tokenizer = _unpickle(TOKENIZER_PATH)

        return model, le_type, le_queue, mlb, tokenizer

    except FileNotFoundError as e:
        st.error(f"Required file not found: {e}. Please ensure all artifacts (model.h5, *.pkl) are uploaded.")
        st.stop()
    except Exception as e:
        st.error(f"An error occurred while loading resources: {e}")
        st.stop()

model, le_type, le_queue, mlb, tokenizer = load_resources()
80
+
81
+ # --- Text Preprocessing Functions ---
82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
def clean_text(t):
    """Normalize raw ticket text for the classifier.

    Steps: lowercase, strip URLs / e-mail addresses / escaped newlines,
    drop remaining punctuation, then tokenize and remove stopwords and
    tokens of length <= 2. Returns "" for None/NaN input.

    NOTE(review): this must mirror the training-time preprocessing —
    confirm against the training pipeline.
    """
    if t is None or pd.isna(t):
        return ""

    t = t.lower()

    # Regex cleaning must run BEFORE tokenization: word_tokenize splits
    # URLs and e-mail addresses into fragments, after which these patterns
    # can no longer match them. Also match the full address (\S+@\S+),
    # not just the @domain half.
    t = re.sub(r"http\S+|www\.\S+|\S+@\S+", " ", t)  # URLs and e-mails
    t = re.sub(r"\\n", " ", t)                       # literal escaped newlines
    t = re.sub(r"[^a-z0-9\s]", " ", t)               # punctuation / noise
    t = re.sub(r"\s+", " ", t).strip()               # consolidate spaces

    # Tokenize, then drop stopwords and very short tokens.
    tokens = word_tokenize(t)
    tokens = [w for w in tokens if w not in stop_english and len(w) > 2]
    return " ".join(tokens)
102
 
 
 
 
 
def convert_to_sequence(txt):
    """Convert one cleaned string into a model-ready padded int sequence."""
    # texts_to_sequences expects an iterable of texts, hence the 1-item list.
    encoded = tokenizer.texts_to_sequences([txt])
    # Pre-padding/truncation to the fixed length used at training time.
    return pad_sequences(
        encoded, maxlen=MAX_SEQ_LEN, padding="pre", truncating="pre"
    )
110
 
111
# --- Streamlit UI ---
st.set_page_config(page_title="Ticket Classification")
st.title("🎫 Ticket Classification App")

# Show a worked example so users know the expected input shape.
st.header("Example Input")
st.markdown("**Subject:** Account Disruption")
st.code("""Dear Customer Support Team,
I am writing to report a significant problem with the centralized account management portal...""")
st.write("---")

# Two-column input form: a short subject line and a longer body.
col1, col2 = st.columns(2)
with col1:
    subject = st.text_input("Enter your **Subject**:", key="subject_input")
with col2:
    body = st.text_area("Enter your **Body**:", key="body_input", height=100)
129
 
130
# --- Prediction Logic ---
if st.button("Submit"):
    if not subject and not body:
        st.warning("Please enter a subject or body text to classify.")
    else:
        # Combine subject and body into one document, then clean it.
        cleaned = clean_text(f"{subject} {body}")

        st.subheader("Preprocessing Results")
        st.info(f"**Cleaned Text:** {cleaned}")

        seq = convert_to_sequence(cleaned)

        with st.spinner("Classifying ticket..."):
            # The model has three output heads: type and queue
            # (single-label) plus tags (multi-label).
            type_probs, queue_probs, tags_probs = model.predict(seq, verbose=0)

        # 1. Single-label heads: arg-max class index mapped back to its label.
        pred_type = le_type.inverse_transform([np.argmax(type_probs)])[0]
        pred_queue = le_queue.inverse_transform([np.argmax(queue_probs)])[0]

        # 2. Multi-label head: threshold each tag probability at 0.5.
        # inverse_transform returns a list of tuples; [0] is this sample.
        pred_tags = mlb.inverse_transform((tags_probs >= 0.5).astype(int))[0]

        st.success("✅ Classification Complete!")

        st.subheader("Prediction Results")
        st.metric("Predicted Type", pred_type)
        st.metric("Predicted Queue", pred_queue)

        if pred_tags:
            st.markdown(f"**Predicted Tags:** {', '.join(pred_tags)}")
        else:
            st.markdown("**Predicted Tags:** No significant tags found.")