# Source: Hugging Face Space by Archana731 ("Update app.py", commit 1d70bd8,
# 5.94 kB). The web-page chrome lines ("raw / history blame" etc.) were
# converted to this comment so the file parses as valid Python.
import os
# Force CPU-only inference and disable oneDNN custom ops. These environment
# variables are read by TensorFlow at import time, so they MUST be set
# before `import tensorflow` below.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
import re
import nltk
import pickle
import numpy as np
import pandas as pd
import streamlit as st
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Note: tokenizer from Keras is not strictly needed for loading,
# but included for completeness if needed for re-training later.
# --- IMPORTANT: TensorFlow Legacy Loader (Ensures compatibility) ---
# Use TensorFlow's legacy loader for models
# Alias so the rest of the file calls a short, stable name.
load_model = tf.keras.models.load_model
# --- NLTK Configuration for Hugging Face Spaces ---
# HF Spaces use persistent storage, but downloading NLTK data on
# startup is safer for fresh environment builds.
@st.cache_resource
def setup_nltk():
    """Download required NLTK data (cached once per session) and return
    the English stopword set used by clean_text().

    Returns:
        set[str]: lowercase English stopwords.
    """
    # word_tokenize needs the Punkt tokenizer. NLTK >= 3.8.2 loads it
    # from the 'punkt_tab' resource instead of 'punkt', so ensure both
    # variants are present — fresh HF Space builds start with no NLTK data.
    for resource, lookup_path in (
        ("punkt", "tokenizers/punkt"),
        ("punkt_tab", "tokenizers/punkt_tab"),
        ("stopwords", "corpora/stopwords"),
    ):
        try:
            nltk.data.find(lookup_path)
        except LookupError:
            nltk.download(resource)
    return set(stopwords.words("english"))
# English stopwords, downloaded/cached once via st.cache_resource.
stop_english = setup_nltk()
# --- File Paths and Loading (CRITICAL for HF Spaces) ---
# Ensure these files are uploaded to your Hugging Face repository
# alongside this 'app.py' file.
MODEL_PATH = "model.h5"
LE_TYPE_PATH = "le_type.pkl"
LE_QUEUE_PATH = "le_queue.pkl"
MLB_PATH = "mlb.pkl"
TOKENIZER_PATH = "tokenizer.pkl"
# Padded sequence length fed to the model at inference time.
MAX_SEQ_LEN = 107  # MUST match training
@st.cache_resource
def load_resources():
    """Load the Keras model and all pickled preprocessors exactly once.

    Returns:
        tuple: (model, le_type, le_queue, mlb, tokenizer).

    On any failure the error is shown in the Streamlit UI and the
    script run is halted via st.stop().
    """
    def _unpickle(path):
        # Read a single pickled artifact from disk.
        with open(path, "rb") as fh:
            return pickle.load(fh)

    try:
        # compile=False: inference only, so the training configuration
        # stored in the .h5 file is not required.
        keras_model = load_model(MODEL_PATH, compile=False)
        # Load the preprocessors in the same order they are returned.
        preprocessors = tuple(
            _unpickle(path)
            for path in (LE_TYPE_PATH, LE_QUEUE_PATH, MLB_PATH, TOKENIZER_PATH)
        )
        return (keras_model,) + preprocessors
    except FileNotFoundError as e:
        st.error(f"Required file not found: {e}. Please ensure all artifacts (model.h5, *.pkl) are uploaded.")
        st.stop()
    except Exception as e:
        st.error(f"An error occurred while loading resources: {e}")
        st.stop()
# Materialize all artifacts at script start; st.cache_resource makes this
# effectively a no-op on Streamlit reruns.
model, le_type, le_queue, mlb, tokenizer = load_resources()
# --- Text Preprocessing Functions ---
def clean_text(t):
    """Normalize raw ticket text for the tokenizer.

    Lowercases the input, keeps only alphanumeric tokens longer than two
    characters that are not English stopwords, then strips residual noise
    (URL/email-like fragments, literal "\\n") and collapses whitespace.

    NOTE(review): the URL/email regex runs AFTER word_tokenize has already
    split the text, so it rarely matches a whole URL — presumably harmless
    because the isalnum() filter drops most punctuation fragments, but
    confirm this matches the training-time preprocessing before changing it.

    Returns:
        str: the cleaned text ("" for None/NaN input).
    """
    if t is None or pd.isna(t):
        return ""
    lowered = t.lower()
    # Keep informative alphanumeric tokens only.
    kept = [
        word
        for word in word_tokenize(lowered)
        if word not in stop_english and len(word) > 2 and word.isalnum()
    ]
    text = " ".join(kept)
    # Remove URL/email-like fragments and literal "\n" sequences.
    text = re.sub(r"http\S+|www\.\S+|@\S+|\\n", " ", text)
    # Drop anything that is not a letter, digit, or whitespace.
    text = re.sub(r"[^a-zA-Z0-9\s]", " ", text)
    # Collapse runs of whitespace and trim the ends.
    return re.sub(r"\s+", " ", text).strip()
def convert_to_sequence(txt):
    """Encode cleaned text as a single padded integer sequence.

    Uses pre-padding/pre-truncation to MAX_SEQ_LEN, matching the
    training-time configuration.

    Returns:
        np.ndarray: shape (1, MAX_SEQ_LEN) batch of token ids.
    """
    encoded = tokenizer.texts_to_sequences([txt])  # API expects a batch (list)
    return pad_sequences(encoded, maxlen=MAX_SEQ_LEN, padding="pre", truncating="pre")
# --- Streamlit UI ---
st.set_page_config(page_title="Ticket Classification")
st.title("🎫 Ticket Classification App")
# Example Text Display
st.header("Example Input")
st.markdown("**Subject:** Account Disruption")
st.code("""Dear Customer Support Team,
I am writing to report a significant problem with the centralized account management portal...""")
st.write("---")
# Input Fields: subject and body side by side.
col1, col2 = st.columns(2)
with col1:
    subject = st.text_input("Enter your **Subject**:", key="subject_input")
with col2:
    body = st.text_area("Enter your **Body**:", key="body_input", height=100)
# --- Prediction Logic ---
# Runs on every Streamlit rerun where the button was clicked.
if st.button("Submit"):
    if not subject and not body:
        st.warning("Please enter a subject or body text to classify.")
    else:
        # Combine and Clean: subject and body are classified as one text.
        raw_text = subject + " " + body
        cleaned = clean_text(raw_text)
        st.subheader("Preprocessing Results")
        st.info(f"**Cleaned Text:** {cleaned}")
        # Convert and Predict
        seq = convert_to_sequence(cleaned)
        with st.spinner("Classifying ticket..."):
            preds = model.predict(seq, verbose=0)
        # The model has three output heads: type, queue, and multi-label tags.
        pred_type_probs, pred_queue_probs, pred_tags_probs = preds
        # 1. Decode single-label outputs via argmax + label-encoder inverse.
        pred_type = le_type.inverse_transform([np.argmax(pred_type_probs)])[0]
        pred_queue = le_queue.inverse_transform([np.argmax(pred_queue_probs)])[0]
        # 2. Decode multi-label outputs (Tags) with a fixed 0.5 threshold.
        pred_tags_binary = (pred_tags_probs >= 0.5).astype(int)
        # mlb.inverse_transform returns a list of tuples, so we take the first element (index 0)
        pred_tags = mlb.inverse_transform(pred_tags_binary)[0]
        st.success("✅ Classification Complete!")
        st.subheader("Prediction Results")
        st.metric("Predicted Type", pred_type)
        st.metric("Predicted Queue", pred_queue)
        if pred_tags:
            st.markdown(f"**Predicted Tags:** {', '.join(pred_tags)}")
        else:
            st.markdown("**Predicted Tags:** No significant tags found.")