|
|
import pandas as pd
|
|
|
import re
|
|
|
import nltk
|
|
|
from nltk.corpus import stopwords
|
|
|
from nltk.stem import WordNetLemmatizer
|
|
|
|
|
|
# One-time fetch of the NLTK corpora/models this module depends on.
# `quiet=True` suppresses the per-resource progress chatter; downloads
# are skipped automatically when the resource is already cached locally.
# NOTE(review): newer NLTK releases (>=3.8.2) also require the
# 'punkt_tab' resource for word_tokenize — confirm against the pinned
# nltk version before deploying.
nltk.download('stopwords', quiet=True)

nltk.download('punkt', quiet=True)

nltk.download('wordnet', quiet=True)


# Module-level singletons shared by clean_text(): built once at import
# time so per-row preprocessing does not rebuild them.
stop_words = set(stopwords.words('english'))

lemmatizer = WordNetLemmatizer()
|
|
|
|
|
|
def clean_text(text):
    """Normalize one raw ticket string for downstream NLP.

    Pipeline: lowercase -> strip non-alphanumerics -> tokenize ->
    drop English stopwords -> lemmatize -> re-join with single spaces.

    Parameters
    ----------
    text : str, number, or NaN
        Raw ticket text. Missing values (NaN/None) yield "".
        Non-string scalars are coerced with str() before cleaning.

    Returns
    -------
    str
        Space-joined cleaned tokens ("" for missing input).
    """
    if pd.isna(text):
        return ""

    # Coerce non-string cells (e.g. numeric values read from Excel) so
    # .lower() cannot raise AttributeError.
    text = str(text).lower()

    # Replace anything that is not a lowercase letter, digit, or
    # whitespace with a space, preserving token boundaries.
    text = re.sub(r'[^a-z0-9\s]', ' ', text)

    tokens = nltk.word_tokenize(text)

    # Filter stopwords before lemmatizing so we only pay the
    # lemmatizer cost for tokens we keep.
    tokens = [w for w in tokens if w not in stop_words]

    tokens = [lemmatizer.lemmatize(w) for w in tokens]

    return " ".join(tokens)
|
|
|
|
|
|
def load_and_preprocess_data(filepath):
    """Load the support-ticket spreadsheet and prepare it for modeling.

    Parameters
    ----------
    filepath : str or path-like
        Path to an Excel file containing at least the columns
        'ticket_text', 'issue_type', and 'urgency_level'
        (and optionally 'product').

    Returns
    -------
    pandas.DataFrame
        Data with rows missing text or either label dropped, a new
        'clean_text' column added, and 'product' NaNs replaced by "".

    Raises
    ------
    KeyError
        If any of the required columns is absent from the file.
    """
    df = pd.read_excel(filepath)

    # Rows without text or either target label are unusable for
    # training, so drop them up front.
    df = df.dropna(subset=['ticket_text', 'issue_type', 'urgency_level'])

    df['clean_text'] = df['ticket_text'].apply(clean_text)

    # 'product' is an optional feature column: guard against input
    # files that omit it entirely (the original fillna would raise
    # KeyError in that case).
    if 'product' in df.columns:
        df['product'] = df['product'].fillna('')
    else:
        df['product'] = ''

    return df
|
|
|
|