import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources this module relies on at import time.
nltk.download('stopwords')   # stop-word lists read by stopwords.words()
nltk.download('punkt')       # tokenizer models loaded by older NLTK releases
# Newer NLTK releases (3.8.2+) load 'punkt_tab' instead of the pickled
# 'punkt' models inside nltk.word_tokenize; without it tokenization raises
# LookupError on a fresh environment.
nltk.download('punkt_tab')
nltk.download('wordnet')     # lexical database behind WordNetLemmatizer

# English stop words held in a set so membership checks in clean_text are O(1).
stop_words = set(stopwords.words('english'))
# Single shared lemmatizer instance reused for every token.
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Normalize one raw text value into a string of lemmatized tokens.

    The value is lower-cased, every character other than ``a-z``, ``0-9``
    and whitespace is replaced by a space, the result is tokenized with
    ``nltk.word_tokenize``, English stop words are removed and the remaining
    tokens are lemmatized with the module-level ``lemmatizer``.

    Args:
        text: The raw value. Usually a ``str``; missing scalars (``None``,
            ``NaN``, ``pd.NA``) and other non-string scalars such as numbers
            are also accepted.

    Returns:
        The cleaned tokens joined by single spaces, or ``""`` when *text*
        is missing.
    """
    if not isinstance(text, str):
        # Missing cells carry no text at all.
        if pd.isna(text):
            return ""
        # Spreadsheet cells may hold numbers or dates rather than strings;
        # stringify them instead of failing on ``.lower()``.
        text = str(text)

    text = text.lower()
    # Replace (rather than delete) special characters so that words joined
    # by punctuation do not get glued together.
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    tokens = nltk.word_tokenize(text)
    # Stop words are filtered before lemmatization, matching the raw token.
    return " ".join(
        lemmatizer.lemmatize(w) for w in tokens if w not in stop_words
    )

def load_and_preprocess_data(filepath):
    """Read a ticket spreadsheet and return it ready for downstream use.

    Rows lacking a value in ``ticket_text``, ``issue_type`` or
    ``urgency_level`` are discarded. A ``clean_text`` column is added by
    running ``clean_text`` over ``ticket_text``, and missing ``product``
    values are replaced with an empty string.

    Args:
        filepath: Location of the Excel file, passed straight to
            ``pd.read_excel``.

    Returns:
        The filtered DataFrame with the extra ``clean_text`` column.
    """
    required_columns = ['ticket_text', 'issue_type', 'urgency_level']

    # A row is only usable when the text and both labels are present.
    tickets = pd.read_excel(filepath).dropna(subset=required_columns)

    tickets['clean_text'] = tickets['ticket_text'].apply(clean_text)

    # Product is optional information: keep the row, blank out the gap.
    tickets['product'] = tickets['product'].fillna('')
    return tickets