"""Text cleaning and dataset loading helpers for the ticket dataset."""

import re

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used below. These calls run at import time and
# are no-ops when the resources are already present locally.
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases (3.9+) load the word tokenizer models from
# 'punkt_tab' instead of 'punkt'; fetch both so either version works.
nltk.download('punkt_tab')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Anything that is not a lowercase letter, digit, or whitespace. Applied
# after lowercasing, so uppercase letters never reach this pattern.
_NON_ALNUM_RE = re.compile(r'[^a-z0-9\s]')


def clean_text(text) -> str:
    """Normalize one piece of raw text into a space-joined token string.

    Steps, in order: lowercase, replace every character outside
    ``[a-z0-9\\s]`` with a space, tokenize with ``nltk.word_tokenize``,
    drop English stop words, and lemmatize the remaining tokens.

    Args:
        text: The raw value to clean. Missing values (``None``, ``NaN``,
            ``pd.NA``, ``NaT``) yield an empty string. Non-string scalars
            such as numbers are converted with ``str()`` first, so numeric
            spreadsheet cells do not raise ``AttributeError``.

    Returns:
        The cleaned tokens joined by single spaces; ``""`` when the input
        is missing or nothing survives the filtering.
    """
    if pd.isna(text):
        return ""

    lowered = str(text).lower()
    stripped = _NON_ALNUM_RE.sub(' ', lowered)

    # Stop words are filtered on the raw token, before lemmatization; the
    # lemmatized form is not re-checked against the stop-word set.
    tokens = [
        lemmatizer.lemmatize(token)
        for token in nltk.word_tokenize(stripped)
        if token not in stop_words
    ]
    return " ".join(tokens)


def load_and_preprocess_data(filepath) -> pd.DataFrame:
    """Load the Excel file at *filepath* and prepare it for modelling.

    Rows missing any of ``ticket_text``, ``issue_type`` or
    ``urgency_level`` are dropped, a ``clean_text`` column is derived from
    ``ticket_text`` via :func:`clean_text`, and missing ``product`` values
    are replaced with an empty string.

    Args:
        filepath: Anything accepted by ``pandas.read_excel``.

    Returns:
        The filtered DataFrame with the added ``clean_text`` column.

    Raises:
        KeyError: If any of the columns named above is absent from the
            sheet.
    """
    df = pd.read_excel(filepath)

    # Drop rows with missing critical labels.
    df = df.dropna(subset=['ticket_text', 'issue_type', 'urgency_level'])

    df['clean_text'] = df['ticket_text'].apply(clean_text)

    # Fill missing product info with empty string.
    df['product'] = df['product'].fillna('')

    return df