Satyam0077's picture
Upload 4 files
b5c1242 verified
raw
history blame contribute delete
996 Bytes
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK resources used below at import time: the stop-word list,
# the tokenizer models used by nltk.word_tokenize, and the WordNet corpus
# used by the lemmatizer.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the tokenizer models from 'punkt_tab' rather than
# 'punkt'; fetch both so word_tokenize works whichever version is installed.
nltk.download('punkt_tab')
nltk.download('wordnet')

# Built once at module level so clean_text gets O(1) stop-word membership
# tests and reuses a single lemmatizer instance.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
def clean_text(text):
    """Normalize raw ticket text into a space-joined string of lemmas.

    The text is lower-cased, every character other than ``a-z``, ``0-9`` and
    whitespace is replaced by a space, the result is tokenized with
    ``nltk.word_tokenize``, tokens found in ``stop_words`` are dropped and
    each remaining token is lemmatized.

    Args:
        text: The raw text. Missing scalars (``None``, ``NaN``, ...) yield
            ``""``; other non-string scalars (e.g. a numeric spreadsheet
            cell) are converted with ``str()`` before cleaning.

    Returns:
        The cleaned tokens joined by single spaces, or ``""`` when no
        tokens remain.
    """
    if not isinstance(text, str):
        if pd.isna(text):
            return ""
        # Cells holding only a number arrive as int/float, which have no
        # .lower(); stringify them instead of raising AttributeError.
        text = str(text)

    text = re.sub(r'[^a-z0-9\s]', ' ', text.lower())  # remove special chars
    if not text.strip():
        # Only whitespace is left, so there is nothing to tokenize.
        return ""

    tokens = (w for w in nltk.word_tokenize(text) if w not in stop_words)
    return " ".join(lemmatizer.lemmatize(w) for w in tokens)
def load_and_preprocess_data(filepath):
    """Read the ticket spreadsheet and return a preprocessed DataFrame.

    Rows lacking ``ticket_text``, ``issue_type`` or ``urgency_level`` are
    discarded, a ``clean_text`` column is derived from ``ticket_text`` with
    ``clean_text``, and missing ``product`` values become empty strings.

    Args:
        filepath: Passed straight to ``pd.read_excel``.

    Returns:
        The filtered DataFrame with the added ``clean_text`` column and the
        filled ``product`` column.
    """
    # Columns that must be present on a row for it to be kept.
    required_columns = ['ticket_text', 'issue_type', 'urgency_level']
    tickets = pd.read_excel(filepath).dropna(subset=required_columns)

    tickets['clean_text'] = tickets['ticket_text'].apply(clean_text)

    # Product is optional: represent a missing value as an empty string.
    tickets['product'] = tickets['product'].fillna('')
    return tickets