# Source: Hugging Face Space by Archana731 ("Update app.py", commit 1d70bd8,
# 5.94 kB). The web-page chrome lines ("raw / history blame" etc.) were
# converted to this comment so the file parses as valid Python.
import os
# Force CPU-only inference and disable oneDNN custom ops. These environment
# variables are read by TensorFlow at import time, so they MUST be set
# before `import tensorflow` below.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
import re
import nltk
import pickle
import numpy as np
import pandas as pd
import streamlit as st
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Note: tokenizer from Keras is not strictly needed for loading,
# but included for completeness if needed for re-training later.
# --- IMPORTANT: TensorFlow Legacy Loader (Ensures compatibility) ---
# Use TensorFlow's legacy loader for models
# Alias so the rest of the file calls a short, stable name.
load_model = tf.keras.models.load_model
# --- NLTK Configuration for Hugging Face Spaces ---
# HF Spaces use persistent storage, but downloading NLTK data on
# startup is safer for fresh environment builds.
@st.cache_resource
def setup_nltk():
    """Download required NLTK data (cached once per session) and return
    the English stopword set used by clean_text().

    Returns:
        set[str]: lowercase English stopwords.
    """
    # word_tokenize needs the Punkt tokenizer. NLTK >= 3.8.2 loads it
    # from the 'punkt_tab' resource instead of 'punkt', so ensure both
    # variants are present — fresh HF Space builds start with no NLTK data.
    for resource, lookup_path in (
        ("punkt", "tokenizers/punkt"),
        ("punkt_tab", "tokenizers/punkt_tab"),
        ("stopwords", "corpora/stopwords"),
    ):
        try:
            nltk.data.find(lookup_path)
        except LookupError:
            nltk.download(resource)
    return set(stopwords.words("english"))
# English stopwords, downloaded/cached once via st.cache_resource.
stop_english = setup_nltk()
# --- File Paths and Loading (CRITICAL for HF Spaces) ---
# Ensure these files are uploaded to your Hugging Face repository
# alongside this 'app.py' file.
MODEL_PATH = "model.h5"
LE_TYPE_PATH = "le_type.pkl"
LE_QUEUE_PATH = "le_queue.pkl"
MLB_PATH = "mlb.pkl"
TOKENIZER_PATH = "tokenizer.pkl"
# Padded sequence length fed to the model at inference time.
MAX_SEQ_LEN = 107  # MUST match training
@st.cache_resource
def load_resources():
    """Load the Keras model and all pickled preprocessors exactly once.

    Returns:
        tuple: (model, le_type, le_queue, mlb, tokenizer).

    On any failure the error is shown in the Streamlit UI and the
    script run is halted via st.stop().
    """
    def _unpickle(path):
        # Read a single pickled artifact from disk.
        with open(path, "rb") as fh:
            return pickle.load(fh)

    try:
        # compile=False: inference only, so the training configuration
        # stored in the .h5 file is not required.
        keras_model = load_model(MODEL_PATH, compile=False)
        # Load the preprocessors in the same order they are returned.
        preprocessors = tuple(
            _unpickle(path)
            for path in (LE_TYPE_PATH, LE_QUEUE_PATH, MLB_PATH, TOKENIZER_PATH)
        )
        return (keras_model,) + preprocessors
    except FileNotFoundError as e:
        st.error(f"Required file not found: {e}. Please ensure all artifacts (model.h5, *.pkl) are uploaded.")
        st.stop()
    except Exception as e:
        st.error(f"An error occurred while loading resources: {e}")
        st.stop()
# Materialize all artifacts at script start; st.cache_resource makes this
# effectively a no-op on Streamlit reruns.
model, le_type, le_queue, mlb, tokenizer = load_resources()
# --- Text Preprocessing Functions ---
def clean_text(t):
    """Normalize raw ticket text for the tokenizer.

    Lowercases the input, keeps only alphanumeric tokens longer than two
    characters that are not English stopwords, then strips residual noise
    (URL/email-like fragments, literal "\\n") and collapses whitespace.

    NOTE(review): the URL/email regex runs AFTER word_tokenize has already
    split the text, so it rarely matches a whole URL — presumably harmless
    because the isalnum() filter drops most punctuation fragments, but
    confirm this matches the training-time preprocessing before changing it.

    Returns:
        str: the cleaned text ("" for None/NaN input).
    """
    if t is None or pd.isna(t):
        return ""
    lowered = t.lower()
    # Keep informative alphanumeric tokens only.
    kept = [
        word
        for word in word_tokenize(lowered)
        if word not in stop_english and len(word) > 2 and word.isalnum()
    ]
    text = " ".join(kept)
    # Remove URL/email-like fragments and literal "\n" sequences.
    text = re.sub(r"http\S+|www\.\S+|@\S+|\\n", " ", text)
    # Drop anything that is not a letter, digit, or whitespace.
    text = re.sub(r"[^a-zA-Z0-9\s]", " ", text)
    # Collapse runs of whitespace and trim the ends.
    return re.sub(r"\s+", " ", text).strip()
def convert_to_sequence(txt):
    """Encode cleaned text as a single padded integer sequence.

    Uses pre-padding/pre-truncation to MAX_SEQ_LEN, matching the
    training-time configuration.

    Returns:
        np.ndarray: shape (1, MAX_SEQ_LEN) batch of token ids.
    """
    encoded = tokenizer.texts_to_sequences([txt])  # API expects a batch (list)
    return pad_sequences(encoded, maxlen=MAX_SEQ_LEN, padding="pre", truncating="pre")
# --- Streamlit UI ---
st.set_page_config(page_title="Ticket Classification")
st.title("🎫 Ticket Classification App")
# Example Text Display
st.header("Example Input")
st.markdown("**Subject:** Account Disruption")
st.code("""Dear Customer Support Team,
I am writing to report a significant problem with the centralized account management portal...""")
st.write("---")
# Input Fields: subject and body side by side.
col1, col2 = st.columns(2)
with col1:
    subject = st.text_input("Enter your **Subject**:", key="subject_input")
with col2:
    body = st.text_area("Enter your **Body**:", key="body_input", height=100)
# --- Prediction Logic ---
# Runs on every Streamlit rerun where the button was clicked.
if st.button("Submit"):
    if not subject and not body:
        st.warning("Please enter a subject or body text to classify.")
    else:
        # Combine and Clean: subject and body are classified as one text.
        raw_text = subject + " " + body
        cleaned = clean_text(raw_text)
        st.subheader("Preprocessing Results")
        st.info(f"**Cleaned Text:** {cleaned}")
        # Convert and Predict
        seq = convert_to_sequence(cleaned)
        with st.spinner("Classifying ticket..."):
            preds = model.predict(seq, verbose=0)
        # The model has three output heads: type, queue, and multi-label tags.
        pred_type_probs, pred_queue_probs, pred_tags_probs = preds
        # 1. Decode single-label outputs via argmax + label-encoder inverse.
        pred_type = le_type.inverse_transform([np.argmax(pred_type_probs)])[0]
        pred_queue = le_queue.inverse_transform([np.argmax(pred_queue_probs)])[0]
        # 2. Decode multi-label outputs (Tags) with a fixed 0.5 threshold.
        pred_tags_binary = (pred_tags_probs >= 0.5).astype(int)
        # mlb.inverse_transform returns a list of tuples, so we take the first element (index 0)
        pred_tags = mlb.inverse_transform(pred_tags_binary)[0]
        st.success("✅ Classification Complete!")
        st.subheader("Prediction Results")
        st.metric("Predicted Type", pred_type)
        st.metric("Predicted Queue", pred_queue)
        if pred_tags:
            st.markdown(f"**Predicted Tags:** {', '.join(pred_tags)}")
        else:
            st.markdown("**Predicted Tags:** No significant tags found.")